zd21's picture
Upload folder using huggingface_hub
09951d0 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8591885441527446,
"eval_steps": 10,
"global_step": 360,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 1852.119140625,
"epoch": 0.002386634844868735,
"grad_norm": 0.24904002787956508,
"kl": 0.0,
"learning_rate": 7.692307692307692e-08,
"loss": 0.0859,
"reward": 0.9985864162445068,
"reward_std": 0.8023478388786316,
"rewards/": 5.9453125,
"rewards/math_compute_score": -0.2380952388048172,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 1857.7381591796875,
"epoch": 0.00477326968973747,
"grad_norm": 0.24734786328196506,
"kl": 0.0,
"learning_rate": 1.5384615384615385e-07,
"loss": 0.0801,
"reward": 1.084356427192688,
"reward_std": 0.8982762098312378,
"rewards/": 5.802734375,
"rewards/math_compute_score": -0.095238097012043,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 1849.666748046875,
"epoch": 0.007159904534606206,
"grad_norm": 0.2778667928712439,
"kl": 9.202957153320312e-05,
"learning_rate": 2.3076923076923078e-07,
"loss": 0.0626,
"reward": 1.2749255895614624,
"reward_std": 1.0164073705673218,
"rewards/": 5.803199291229248,
"rewards/math_compute_score": 0.1428571492433548,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 1977.166748046875,
"epoch": 0.00954653937947494,
"grad_norm": 0.21684956038503114,
"kl": 6.771087646484375e-05,
"learning_rate": 3.076923076923077e-07,
"loss": 0.021,
"reward": 0.7873699069023132,
"reward_std": 0.7230538725852966,
"rewards/": 6.032087326049805,
"rewards/math_compute_score": -0.523809552192688,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 1939.40478515625,
"epoch": 0.011933174224343675,
"grad_norm": 0.24259770313581716,
"kl": 9.012222290039062e-05,
"learning_rate": 3.8461538461538463e-07,
"loss": 0.0349,
"reward": 1.0736374855041504,
"reward_std": 0.8964518904685974,
"rewards/": 6.130092144012451,
"rewards/math_compute_score": -0.190476194024086,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 2030.21435546875,
"epoch": 0.014319809069212411,
"grad_norm": 0.23532745486358006,
"kl": 0.000102996826171875,
"learning_rate": 4.6153846153846156e-07,
"loss": 0.0077,
"reward": 0.669866144657135,
"reward_std": 0.6640447378158569,
"rewards/": 6.015996932983398,
"rewards/math_compute_score": -0.6666666865348816,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 1837.047607421875,
"epoch": 0.016706443914081145,
"grad_norm": 0.24695210299782985,
"kl": 7.62939453125e-05,
"learning_rate": 5.384615384615384e-07,
"loss": 0.0643,
"reward": 1.1588542461395264,
"reward_std": 0.8526113629341125,
"rewards/": 5.794270992279053,
"rewards/math_compute_score": 0.0,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 1710.7857666015625,
"epoch": 0.01909307875894988,
"grad_norm": 0.2635039041467129,
"kl": 0.00010061264038085938,
"learning_rate": 6.153846153846154e-07,
"loss": 0.0352,
"reward": 1.4391371011734009,
"reward_std": 0.6852283477783203,
"rewards/": 6.624256134033203,
"rewards/math_compute_score": 0.1428571492433548,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 1951.8095703125,
"epoch": 0.021479713603818614,
"grad_norm": 0.22619335380819766,
"kl": 7.82012939453125e-05,
"learning_rate": 6.923076923076922e-07,
"loss": 0.0341,
"reward": 0.9593006372451782,
"reward_std": 0.7629837989807129,
"rewards/": 6.129836559295654,
"rewards/math_compute_score": -0.3333333432674408,
"step": 9
},
{
"epoch": 0.02386634844868735,
"grad_norm": 0.2507566093873831,
"learning_rate": 7.692307692307693e-07,
"loss": 0.026,
"step": 10
},
{
"epoch": 0.02386634844868735,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1909.1607666015625,
"eval_kl": 9.047985076904297e-05,
"eval_loss": 0.034137628972530365,
"eval_reward": 1.143368422985077,
"eval_reward_std": 0.7167749404907227,
"eval_rewards/": 6.193032503128052,
"eval_rewards/math_compute_score": -0.11904762079939246,
"eval_runtime": 94.5877,
"eval_samples_per_second": 0.222,
"eval_steps_per_second": 0.011,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 1677.8928833007812,
"epoch": 0.026252983293556086,
"grad_norm": 0.2771089455944722,
"kl": 0.00010251998901367188,
"learning_rate": 8.461538461538461e-07,
"loss": 0.0246,
"reward": 1.308068335056305,
"reward_std": 0.6968488693237305,
"rewards/": 5.778436660766602,
"rewards/math_compute_score": 0.19047619588673115,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 1728.4285888671875,
"epoch": 0.028639618138424822,
"grad_norm": 0.27092892547921155,
"kl": 0.0001068115234375,
"learning_rate": 9.230769230769231e-07,
"loss": 0.0862,
"reward": 1.2834821939468384,
"reward_std": 0.9967786073684692,
"rewards/": 5.750744342803955,
"rewards/math_compute_score": 0.1666666716337204,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 1984.619140625,
"epoch": 0.031026252983293555,
"grad_norm": 0.22856130082794326,
"kl": 9.012222290039062e-05,
"learning_rate": 1e-06,
"loss": 0.038,
"reward": 0.910714328289032,
"reward_std": 0.7129672169685364,
"rewards/": 6.458333492279053,
"rewards/math_compute_score": -0.4761904776096344,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 1827.3095703125,
"epoch": 0.03341288782816229,
"grad_norm": 0.2724826324747265,
"kl": 0.00011968612670898438,
"learning_rate": 9.99985031250522e-07,
"loss": 0.014,
"reward": 1.2249256372451782,
"reward_std": 0.595586895942688,
"rewards/": 6.2198662757873535,
"rewards/math_compute_score": -0.02380952425301075,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 1637.6905517578125,
"epoch": 0.03579952267303103,
"grad_norm": 0.27133525373007367,
"kl": 9.775161743164062e-05,
"learning_rate": 9.999401258983425e-07,
"loss": 0.0523,
"reward": 1.1956148147583008,
"reward_std": 0.6316924691200256,
"rewards/": 5.692359447479248,
"rewards/math_compute_score": 0.0714285746216774,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 1857.0,
"epoch": 0.03818615751789976,
"grad_norm": 0.25395778947044934,
"kl": 9.822845458984375e-05,
"learning_rate": 9.998652866321687e-07,
"loss": 0.0322,
"reward": 1.1026042699813843,
"reward_std": 0.5841531157493591,
"rewards/": 6.8463544845581055,
"rewards/math_compute_score": -0.3333333432674408,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 1820.2857666015625,
"epoch": 0.0405727923627685,
"grad_norm": 0.27104243767092423,
"kl": 0.00010204315185546875,
"learning_rate": 9.997605179330017e-07,
"loss": 0.0587,
"reward": 1.2473958730697632,
"reward_std": 0.9411390423774719,
"rewards/": 6.332217216491699,
"rewards/math_compute_score": -0.02380952425301075,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 2032.6905517578125,
"epoch": 0.04295942720763723,
"grad_norm": 0.2425790646876099,
"kl": 9.679794311523438e-05,
"learning_rate": 9.996258260738674e-07,
"loss": 0.0062,
"reward": 1.0361607074737549,
"reward_std": 0.7374575734138489,
"rewards/": 6.704613208770752,
"rewards/math_compute_score": -0.380952388048172,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 1860.5238037109375,
"epoch": 0.045346062052505964,
"grad_norm": 0.2526155285730889,
"kl": 9.393692016601562e-05,
"learning_rate": 9.994612191194405e-07,
"loss": 0.0504,
"reward": 1.2767857313156128,
"reward_std": 0.6392523646354675,
"rewards/": 6.19345235824585,
"rewards/math_compute_score": 0.0476190485060215,
"step": 19
},
{
"epoch": 0.0477326968973747,
"grad_norm": 0.2019998878404815,
"learning_rate": 9.992667069255618e-07,
"loss": 0.0563,
"step": 20
},
{
"epoch": 0.0477326968973747,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1916.7678833007812,
"eval_kl": 8.654594421386719e-05,
"eval_loss": 0.045527394860982895,
"eval_reward": 1.1353678554296494,
"eval_reward_std": 0.7787726670503616,
"eval_rewards/": 6.105410695075989,
"eval_rewards/math_compute_score": -0.10714286100119352,
"eval_runtime": 94.5472,
"eval_samples_per_second": 0.222,
"eval_steps_per_second": 0.011,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 1885.857177734375,
"epoch": 0.050119331742243436,
"grad_norm": 0.23793932188276107,
"kl": 7.82012939453125e-05,
"learning_rate": 9.990423011386488e-07,
"loss": 0.0364,
"reward": 1.1678432822227478,
"reward_std": 0.7990120947360992,
"rewards/": 6.220168590545654,
"rewards/math_compute_score": -0.09523809887468815,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 1924.619140625,
"epoch": 0.05250596658711217,
"grad_norm": 0.24652779541561778,
"kl": 9.965896606445312e-05,
"learning_rate": 9.987880151949975e-07,
"loss": 0.0332,
"reward": 0.9929687976837158,
"reward_std": 0.6396169066429138,
"rewards/": 5.917224884033203,
"rewards/math_compute_score": -0.2380952388048172,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 1572.8095703125,
"epoch": 0.05489260143198091,
"grad_norm": 0.2535111310244703,
"kl": 7.486343383789062e-05,
"learning_rate": 9.985038643199778e-07,
"loss": 0.0609,
"reward": 1.378557562828064,
"reward_std": 0.7171154618263245,
"rewards/": 4.892787456512451,
"rewards/math_compute_score": 0.5,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 1634.3095703125,
"epoch": 0.057279236276849645,
"grad_norm": 0.28708136371156134,
"kl": 9.822845458984375e-05,
"learning_rate": 9.981898655271234e-07,
"loss": 0.0954,
"reward": 1.303076148033142,
"reward_std": 0.84552401304245,
"rewards/": 5.658237934112549,
"rewards/math_compute_score": 0.2142857164144516,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 1806.9761962890625,
"epoch": 0.059665871121718374,
"grad_norm": 0.23136332868998175,
"kl": 8.58306884765625e-05,
"learning_rate": 9.978460376171112e-07,
"loss": 0.0437,
"reward": 1.099237322807312,
"reward_std": 0.5548410415649414,
"rewards/": 6.258091449737549,
"rewards/math_compute_score": -0.190476194024086,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 1635.0,
"epoch": 0.06205250596658711,
"grad_norm": 0.27536984991640817,
"kl": 0.00011491775512695312,
"learning_rate": 9.974724011766361e-07,
"loss": 0.0625,
"reward": 1.6150113344192505,
"reward_std": 0.5635201930999756,
"rewards/": 6.456008434295654,
"rewards/math_compute_score": 0.4047619104385376,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 1988.5,
"epoch": 0.06443914081145585,
"grad_norm": 0.22513198771984447,
"kl": 8.0108642578125e-05,
"learning_rate": 9.970689785771798e-07,
"loss": 0.0169,
"reward": 1.2694941759109497,
"reward_std": 0.6696727275848389,
"rewards/": 6.442708492279053,
"rewards/math_compute_score": -0.02380952425301075,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 1631.1429443359375,
"epoch": 0.06682577565632458,
"grad_norm": 0.2269622871051312,
"kl": 7.2479248046875e-05,
"learning_rate": 9.96635793973669e-07,
"loss": 0.0095,
"reward": 1.7527531385421753,
"reward_std": 0.3711332380771637,
"rewards/": 6.76376485824585,
"rewards/math_compute_score": 0.5,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 1927.3095703125,
"epoch": 0.06921241050119331,
"grad_norm": 0.2768313841282757,
"kl": 0.00011539459228515625,
"learning_rate": 9.961728733030316e-07,
"loss": 0.0523,
"reward": 1.046147346496582,
"reward_std": 0.8631386756896973,
"rewards/": 5.992640972137451,
"rewards/math_compute_score": -0.190476194024086,
"step": 29
},
{
"epoch": 0.07159904534606205,
"grad_norm": 0.22336747996927972,
"learning_rate": 9.956802442826415e-07,
"loss": -0.0099,
"step": 30
},
{
"epoch": 0.07159904534606205,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1907.7559814453125,
"eval_kl": 8.702278137207031e-05,
"eval_loss": 0.05547888204455376,
"eval_reward": 1.119970753788948,
"eval_reward_std": 0.7313214838504791,
"eval_rewards/": 6.195091724395752,
"eval_rewards/math_compute_score": -0.14880952518433332,
"eval_runtime": 94.0737,
"eval_samples_per_second": 0.223,
"eval_steps_per_second": 0.011,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 1920.3095703125,
"epoch": 0.07398568019093078,
"grad_norm": 0.23160944462486585,
"kl": 7.987022399902344e-05,
"learning_rate": 9.951579364086603e-07,
"loss": 0.0088,
"reward": 1.1973958909511566,
"reward_std": 0.7134246528148651,
"rewards/": 6.367931842803955,
"rewards/math_compute_score": -0.095238097012043,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 1838.6429443359375,
"epoch": 0.07637231503579953,
"grad_norm": 0.24487657247172945,
"kl": 9.202957153320312e-05,
"learning_rate": 9.946059809542706e-07,
"loss": 0.0476,
"reward": 1.3777530193328857,
"reward_std": 0.5670939683914185,
"rewards/": 6.222098350524902,
"rewards/math_compute_score": 0.1666666716337204,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 1876.8095703125,
"epoch": 0.07875894988066826,
"grad_norm": 0.2506016708667217,
"kl": 7.772445678710938e-05,
"learning_rate": 9.940244109678041e-07,
"loss": 0.0588,
"reward": 1.0940476655960083,
"reward_std": 0.597551167011261,
"rewards/": 6.803571701049805,
"rewards/math_compute_score": -0.3333333432674408,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 1674.4761962890625,
"epoch": 0.081145584725537,
"grad_norm": 0.25999785701346345,
"kl": 7.772445678710938e-05,
"learning_rate": 9.93413261270763e-07,
"loss": 0.0355,
"reward": 1.716294765472412,
"reward_std": 0.4909428358078003,
"rewards/": 6.48623514175415,
"rewards/math_compute_score": 0.523809552192688,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 1891.5,
"epoch": 0.08353221957040573,
"grad_norm": 0.26237200501370733,
"kl": 9.72747802734375e-05,
"learning_rate": 9.927725684557339e-07,
"loss": 0.0563,
"reward": 1.2531064748764038,
"reward_std": 0.7479419112205505,
"rewards/": 6.075056076049805,
"rewards/math_compute_score": 0.0476190485060215,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 1857.59521484375,
"epoch": 0.08591885441527446,
"grad_norm": 0.23853807456535067,
"kl": 6.198883056640625e-05,
"learning_rate": 9.921023708841973e-07,
"loss": 0.0385,
"reward": 1.121354103088379,
"reward_std": 0.6391122341156006,
"rewards/": 6.368675708770752,
"rewards/math_compute_score": -0.190476194024086,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 1826.8333740234375,
"epoch": 0.0883054892601432,
"grad_norm": 0.2518634206321752,
"kl": 8.630752563476562e-05,
"learning_rate": 9.914027086842322e-07,
"loss": 0.0281,
"reward": 1.080822229385376,
"reward_std": 0.7901343703269958,
"rewards/": 6.166015625,
"rewards/math_compute_score": -0.190476194024086,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 1974.761962890625,
"epoch": 0.09069212410501193,
"grad_norm": 0.21874504590315716,
"kl": 8.106231689453125e-05,
"learning_rate": 9.906736237481108e-07,
"loss": 0.0347,
"reward": 0.7355794310569763,
"reward_std": 0.943155825138092,
"rewards/": 5.01123046875,
"rewards/math_compute_score": -0.3333333432674408,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 1853.9761962890625,
"epoch": 0.09307875894988067,
"grad_norm": 0.2636078943840054,
"kl": 0.00010204315185546875,
"learning_rate": 9.899151597297922e-07,
"loss": 0.0061,
"reward": 0.7527158260345459,
"reward_std": 0.5094756484031677,
"rewards/": 5.76357889175415,
"rewards/math_compute_score": -0.5,
"step": 39
},
{
"epoch": 0.0954653937947494,
"grad_norm": 0.22442688552857057,
"learning_rate": 9.891273620423082e-07,
"loss": 0.0218,
"step": 40
},
{
"epoch": 0.0954653937947494,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1899.7857666015625,
"eval_kl": 7.843971252441406e-05,
"eval_loss": 0.040552277117967606,
"eval_reward": 1.2572033554315567,
"eval_reward_std": 0.8417278081178665,
"eval_rewards/": 6.166969180107117,
"eval_rewards/math_compute_score": 0.02976190485060215,
"eval_runtime": 94.3574,
"eval_samples_per_second": 0.223,
"eval_steps_per_second": 0.011,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 1945.3333129882812,
"epoch": 0.09785202863961814,
"grad_norm": 0.25352200487817333,
"kl": 7.82012939453125e-05,
"learning_rate": 9.883102778550434e-07,
"loss": 0.0225,
"reward": 1.16990327835083,
"reward_std": 0.6465564370155334,
"rewards/": 6.659040212631226,
"rewards/math_compute_score": -0.2023809514939785,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 1662.452392578125,
"epoch": 0.10023866348448687,
"grad_norm": 0.29366924279276885,
"kl": 6.151199340820312e-05,
"learning_rate": 9.874639560909118e-07,
"loss": 0.0167,
"reward": 1.2738094329833984,
"reward_std": 0.7358887791633606,
"rewards/": 5.702381134033203,
"rewards/math_compute_score": 0.1666666716337204,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 1590.8333740234375,
"epoch": 0.1026252983293556,
"grad_norm": 0.2978512257146081,
"kl": 9.72747802734375e-05,
"learning_rate": 9.865884474234275e-07,
"loss": 0.068,
"reward": 1.3785715103149414,
"reward_std": 0.6778793334960938,
"rewards/": 5.75,
"rewards/math_compute_score": 0.2857142984867096,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 1981.261962890625,
"epoch": 0.10501193317422435,
"grad_norm": 0.2557952927437338,
"kl": 0.0001068115234375,
"learning_rate": 9.856838042736696e-07,
"loss": 0.015,
"reward": 0.920479953289032,
"reward_std": 0.9247375726699829,
"rewards/": 6.126209259033203,
"rewards/math_compute_score": -0.380952388048172,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 1779.261962890625,
"epoch": 0.10739856801909307,
"grad_norm": 0.24475320888510424,
"kl": 5.7220458984375e-05,
"learning_rate": 9.847500808071456e-07,
"loss": 0.029,
"reward": 1.5369793176651,
"reward_std": 0.4876500964164734,
"rewards/": 6.161086559295654,
"rewards/math_compute_score": 0.380952388048172,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 1841.90478515625,
"epoch": 0.10978520286396182,
"grad_norm": 0.27708975756974613,
"kl": 0.00010347366333007812,
"learning_rate": 9.837873329305457e-07,
"loss": 0.0135,
"reward": 0.7778274416923523,
"reward_std": 0.5871782898902893,
"rewards/": 5.60342264175415,
"rewards/math_compute_score": -0.4285714328289032,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 1795.119140625,
"epoch": 0.11217183770883055,
"grad_norm": 0.24633285234690444,
"kl": 7.915496826171875e-05,
"learning_rate": 9.82795618288397e-07,
"loss": 0.0583,
"reward": 1.1924666166305542,
"reward_std": 0.8271605372428894,
"rewards/": 5.390903949737549,
"rewards/math_compute_score": 0.1428571492433548,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 1873.6429443359375,
"epoch": 0.11455847255369929,
"grad_norm": 0.23871306735217734,
"kl": 8.726119995117188e-05,
"learning_rate": 9.817749962596114e-07,
"loss": 0.0365,
"reward": 1.1016182899475098,
"reward_std": 0.7240482568740845,
"rewards/": 6.650949001312256,
"rewards/math_compute_score": -0.2857142984867096,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 1836.5238037109375,
"epoch": 0.11694510739856802,
"grad_norm": 0.2602991840828808,
"kl": 7.200241088867188e-05,
"learning_rate": 9.807255279539312e-07,
"loss": 0.0571,
"reward": 1.3672620058059692,
"reward_std": 0.6877846121788025,
"rewards/": 6.550595283508301,
"rewards/math_compute_score": 0.0714285746216774,
"step": 49
},
{
"epoch": 0.11933174224343675,
"grad_norm": 0.257239746248728,
"learning_rate": 9.796472762082685e-07,
"loss": 0.1056,
"step": 50
},
{
"epoch": 0.11933174224343675,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1877.7976379394531,
"eval_kl": 8.618831634521484e-05,
"eval_loss": 0.0589974969625473,
"eval_reward": 1.3616862893104553,
"eval_reward_std": 0.7793268263339996,
"eval_rewards/": 6.427478790283203,
"eval_rewards/math_compute_score": 0.095238097012043,
"eval_runtime": 96.4379,
"eval_samples_per_second": 0.218,
"eval_steps_per_second": 0.01,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 1820.0833740234375,
"epoch": 0.12171837708830549,
"grad_norm": 0.2573216950537411,
"kl": 9.870529174804688e-05,
"learning_rate": 9.785403055829448e-07,
"loss": 0.0382,
"reward": 1.4516844153404236,
"reward_std": 0.7663153111934662,
"rewards/": 6.115564346313477,
"rewards/math_compute_score": 0.2857142798602581,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 1893.261962890625,
"epoch": 0.12410501193317422,
"grad_norm": 0.25066924022690434,
"kl": 0.00010347366333007812,
"learning_rate": 9.77404682357824e-07,
"loss": 0.0493,
"reward": 1.5894160270690918,
"reward_std": 0.8626825213432312,
"rewards/": 7.185174942016602,
"rewards/math_compute_score": 0.190476194024086,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 1847.4285888671875,
"epoch": 0.12649164677804295,
"grad_norm": 0.26985384617928543,
"kl": 0.000110626220703125,
"learning_rate": 9.762404745283437e-07,
"loss": 0.081,
"reward": 0.982366144657135,
"reward_std": 0.8509321808815002,
"rewards/": 5.67373514175415,
"rewards/math_compute_score": -0.190476194024086,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 1893.666748046875,
"epoch": 0.1288782816229117,
"grad_norm": 0.23216642387230993,
"kl": 9.775161743164062e-05,
"learning_rate": 9.75047751801446e-07,
"loss": 0.0562,
"reward": 1.1761904954910278,
"reward_std": 0.5184506773948669,
"rewards/": 5.976190567016602,
"rewards/math_compute_score": -0.02380952425301075,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 1542.6190185546875,
"epoch": 0.13126491646778043,
"grad_norm": 0.2931169431343856,
"kl": 0.00011014938354492188,
"learning_rate": 9.738265855914012e-07,
"loss": 0.0965,
"reward": 1.5120384693145752,
"reward_std": 0.8640336990356445,
"rewards/": 5.941144943237305,
"rewards/math_compute_score": 0.4047619104385376,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 1727.4761962890625,
"epoch": 0.13365155131264916,
"grad_norm": 0.2751344219541302,
"kl": 0.00011014938354492188,
"learning_rate": 9.725770490155338e-07,
"loss": 0.0159,
"reward": 1.548958420753479,
"reward_std": 0.5042334794998169,
"rewards/": 6.792410850524902,
"rewards/math_compute_score": 0.2380952388048172,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 1785.3095703125,
"epoch": 0.1360381861575179,
"grad_norm": 0.24999827557080748,
"kl": 0.000125885009765625,
"learning_rate": 9.712992168898435e-07,
"loss": 0.0367,
"reward": 1.216183066368103,
"reward_std": 0.8457887768745422,
"rewards/": 5.985677242279053,
"rewards/math_compute_score": 0.02380952425301075,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 1770.2381591796875,
"epoch": 0.13842482100238662,
"grad_norm": 0.2367607232006906,
"kl": 0.00011157989501953125,
"learning_rate": 9.699931657245263e-07,
"loss": 0.0687,
"reward": 1.097646951675415,
"reward_std": 0.6355366706848145,
"rewards/": 5.012044429779053,
"rewards/math_compute_score": 0.1190476194024086,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 1794.21435546875,
"epoch": 0.14081145584725538,
"grad_norm": 0.2837311661067899,
"kl": 0.00015926361083984375,
"learning_rate": 9.686589737193928e-07,
"loss": 0.0467,
"reward": 1.027864694595337,
"reward_std": 0.7552880048751831,
"rewards/": 5.615513324737549,
"rewards/math_compute_score": -0.1190476194024086,
"step": 59
},
{
"epoch": 0.1431980906921241,
"grad_norm": 0.23325912751014408,
"learning_rate": 9.67296720759187e-07,
"loss": 0.0344,
"step": 60
},
{
"epoch": 0.1431980906921241,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1855.9643249511719,
"eval_kl": 0.0001424551010131836,
"eval_loss": 0.04132155328989029,
"eval_reward": 1.3015253245830536,
"eval_reward_std": 0.6968550086021423,
"eval_rewards/": 6.221912384033203,
"eval_rewards/math_compute_score": 0.07142857741564512,
"eval_runtime": 93.1523,
"eval_samples_per_second": 0.225,
"eval_steps_per_second": 0.011,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 1760.2500610351562,
"epoch": 0.14558472553699284,
"grad_norm": 0.24769237773421138,
"kl": 0.0001201629638671875,
"learning_rate": 9.659064884088016e-07,
"loss": 0.0936,
"reward": 1.3301293551921844,
"reward_std": 0.6551631987094879,
"rewards/": 6.1744561195373535,
"rewards/math_compute_score": 0.1190476268529892,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 1992.7857666015625,
"epoch": 0.14797136038186157,
"grad_norm": 0.2486362317475695,
"kl": 0.0001544952392578125,
"learning_rate": 9.644883599083957e-07,
"loss": 0.0196,
"reward": 0.8310267925262451,
"reward_std": 0.5494909882545471,
"rewards/": 6.250371932983398,
"rewards/math_compute_score": -0.523809552192688,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 1843.2857666015625,
"epoch": 0.15035799522673032,
"grad_norm": 0.2332848729776955,
"kl": 0.00014209747314453125,
"learning_rate": 9.630424201684103e-07,
"loss": -0.011,
"reward": 1.4165923595428467,
"reward_std": 0.44226470589637756,
"rewards/": 6.511532783508301,
"rewards/math_compute_score": 0.1428571492433548,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 1565.09521484375,
"epoch": 0.15274463007159905,
"grad_norm": 0.26464093358588886,
"kl": 0.000156402587890625,
"learning_rate": 9.615687557644848e-07,
"loss": 0.0191,
"reward": 1.9476191997528076,
"reward_std": 0.4103147089481354,
"rewards/": 6.690476417541504,
"rewards/math_compute_score": 0.761904776096344,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 1935.4285888671875,
"epoch": 0.15513126491646778,
"grad_norm": 0.2563941042729108,
"kl": 0.000148773193359375,
"learning_rate": 9.600674549322716e-07,
"loss": 0.0453,
"reward": 1.275520920753479,
"reward_std": 0.7737724781036377,
"rewards/": 6.758556842803955,
"rewards/math_compute_score": -0.095238097012043,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 1833.5,
"epoch": 0.1575178997613365,
"grad_norm": 0.2318537716691269,
"kl": 0.00012302398681640625,
"learning_rate": 9.585386075621552e-07,
"loss": 0.0385,
"reward": 1.5590215921401978,
"reward_std": 0.6652101278305054,
"rewards/": 6.461774826049805,
"rewards/math_compute_score": 0.3333333432674408,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 1752.4285888671875,
"epoch": 0.15990453460620524,
"grad_norm": 0.27198514000671786,
"kl": 0.00019741058349609375,
"learning_rate": 9.569823051938689e-07,
"loss": 0.09,
"reward": 1.275632381439209,
"reward_std": 0.663583517074585,
"rewards/": 5.901971817016602,
"rewards/math_compute_score": 0.1190476194024086,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 1832.6429443359375,
"epoch": 0.162291169451074,
"grad_norm": 0.2821458726503061,
"kl": 0.0002574920654296875,
"learning_rate": 9.553986410110134e-07,
"loss": 0.0101,
"reward": 1.6696429252624512,
"reward_std": 0.5702826380729675,
"rewards/": 6.824404716491699,
"rewards/math_compute_score": 0.380952388048172,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 2015.90478515625,
"epoch": 0.16467780429594273,
"grad_norm": 0.2419566650414704,
"kl": 0.00018024444580078125,
"learning_rate": 9.537877098354784e-07,
"loss": 0.0112,
"reward": 0.877566933631897,
"reward_std": 0.7397137880325317,
"rewards/": 6.19735860824585,
"rewards/math_compute_score": -0.4523809552192688,
"step": 69
},
{
"epoch": 0.16706443914081145,
"grad_norm": 0.22253088780698607,
"learning_rate": 9.52149608121765e-07,
"loss": 0.021,
"step": 70
},
{
"epoch": 0.16706443914081145,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1840.1190795898438,
"eval_kl": 0.00020503997802734375,
"eval_loss": 0.03286560997366905,
"eval_reward": 1.40643610060215,
"eval_reward_std": 0.6496933400630951,
"eval_rewards/": 6.246465802192688,
"eval_rewards/math_compute_score": 0.1964285671710968,
"eval_runtime": 92.7357,
"eval_samples_per_second": 0.226,
"eval_steps_per_second": 0.011,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 1853.15478515625,
"epoch": 0.16945107398568018,
"grad_norm": 0.20266438209975887,
"kl": 0.00015115737915039062,
"learning_rate": 9.504844339512094e-07,
"loss": -0.0027,
"reward": 1.1317429542541504,
"reward_std": 0.7229768335819244,
"rewards/": 6.230143308639526,
"rewards/math_compute_score": -0.1428571455180645,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 1757.857177734375,
"epoch": 0.1718377088305489,
"grad_norm": 0.2748053071046328,
"kl": 0.000278472900390625,
"learning_rate": 9.487922870261121e-07,
"loss": 0.0455,
"reward": 1.6033483743667603,
"reward_std": 0.7141416668891907,
"rewards/": 6.492931842803955,
"rewards/math_compute_score": 0.380952388048172,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 1690.3809814453125,
"epoch": 0.17422434367541767,
"grad_norm": 0.2990083388684613,
"kl": 0.00022125244140625,
"learning_rate": 9.470732686637664e-07,
"loss": 0.0544,
"reward": 1.4808036088943481,
"reward_std": 0.7631458044052124,
"rewards/": 6.546875,
"rewards/math_compute_score": 0.2142857164144516,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 1857.666748046875,
"epoch": 0.1766109785202864,
"grad_norm": 0.23969410631706858,
"kl": 0.0002002716064453125,
"learning_rate": 9.45327481790393e-07,
"loss": 0.0015,
"reward": 1.123772382736206,
"reward_std": 0.5604047179222107,
"rewards/": 6.476004600524902,
"rewards/math_compute_score": -0.2142857164144516,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 1976.3095703125,
"epoch": 0.17899761336515513,
"grad_norm": 0.24073302005555794,
"kl": 0.0002498626708984375,
"learning_rate": 9.435550309349776e-07,
"loss": 0.0242,
"reward": 1.314062476158142,
"reward_std": 0.7065877914428711,
"rewards/": 6.95126485824585,
"rewards/math_compute_score": -0.095238097012043,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 1773.8095703125,
"epoch": 0.18138424821002386,
"grad_norm": 0.29878792702659795,
"kl": 0.0004253387451171875,
"learning_rate": 9.417560222230114e-07,
"loss": 0.0681,
"reward": 1.7110120058059692,
"reward_std": 0.5763629674911499,
"rewards/": 7.221726417541504,
"rewards/math_compute_score": 0.3333333432674408,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 1844.3095703125,
"epoch": 0.18377088305489261,
"grad_norm": 0.2539852824553324,
"kl": 0.00025177001953125,
"learning_rate": 9.399305633701372e-07,
"loss": 0.0414,
"reward": 1.2555060386657715,
"reward_std": 0.6134779453277588,
"rewards/": 6.658482074737549,
"rewards/math_compute_score": -0.095238097012043,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 1619.6429443359375,
"epoch": 0.18615751789976134,
"grad_norm": 0.27715552033849417,
"kl": 0.0003719329833984375,
"learning_rate": 9.380787636757e-07,
"loss": 0.0674,
"reward": 1.380022406578064,
"reward_std": 0.4771695137023926,
"rewards/": 6.328683376312256,
"rewards/math_compute_score": 0.1428571492433548,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 1651.6190185546875,
"epoch": 0.18854415274463007,
"grad_norm": 0.2917968349033686,
"kl": 0.00030517578125,
"learning_rate": 9.362007340162028e-07,
"loss": 0.0588,
"reward": 1.4904018640518188,
"reward_std": 0.649864912033081,
"rewards/": 6.499628067016602,
"rewards/math_compute_score": 0.2380952388048172,
"step": 79
},
{
"epoch": 0.1909307875894988,
"grad_norm": 0.25391040017369215,
"learning_rate": 9.342965868386673e-07,
"loss": 0.0279,
"step": 80
},
{
"epoch": 0.1909307875894988,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1795.4226684570312,
"eval_kl": 0.0003237724304199219,
"eval_loss": 0.04981280118227005,
"eval_reward": 1.4364235252141953,
"eval_reward_std": 0.6483379453420639,
"eval_rewards/": 6.396403074264526,
"eval_rewards/math_compute_score": 0.1964285746216774,
"eval_runtime": 92.6366,
"eval_samples_per_second": 0.227,
"eval_steps_per_second": 0.011,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 1711.952392578125,
"epoch": 0.19331742243436753,
"grad_norm": 0.24040818501985536,
"kl": 0.000331878662109375,
"learning_rate": 9.323664361539018e-07,
"loss": 0.0842,
"reward": 1.5535017251968384,
"reward_std": 0.7349307537078857,
"rewards/": 6.338937044143677,
"rewards/math_compute_score": 0.3571428544819355,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 1713.6190185546875,
"epoch": 0.1957040572792363,
"grad_norm": 0.2780469140699165,
"kl": 0.00043487548828125,
"learning_rate": 9.304103975296748e-07,
"loss": 0.0477,
"reward": 1.3992561101913452,
"reward_std": 0.5194663405418396,
"rewards/": 6.424851417541504,
"rewards/math_compute_score": 0.1428571492433548,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 1488.40478515625,
"epoch": 0.19809069212410502,
"grad_norm": 0.33565417772392336,
"kl": 0.0004444122314453125,
"learning_rate": 9.284285880837946e-07,
"loss": 0.0969,
"reward": 1.6488840579986572,
"reward_std": 0.5082271099090576,
"rewards/": 6.149181842803955,
"rewards/math_compute_score": 0.523809552192688,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 1670.1905517578125,
"epoch": 0.20047732696897375,
"grad_norm": 0.24053030643983023,
"kl": 0.00046539306640625,
"learning_rate": 9.264211264770976e-07,
"loss": 0.027,
"reward": 1.6110121011734009,
"reward_std": 0.29187697172164917,
"rewards/": 6.91220235824585,
"rewards/math_compute_score": 0.2857142984867096,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 1544.047607421875,
"epoch": 0.20286396181384247,
"grad_norm": 0.25209602424956346,
"kl": 0.000377655029296875,
"learning_rate": 9.243881329063434e-07,
"loss": 0.0807,
"reward": 1.6919922828674316,
"reward_std": 0.6165055632591248,
"rewards/": 6.555199146270752,
"rewards/math_compute_score": 0.4761904776096344,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 1686.3095703125,
"epoch": 0.2052505966587112,
"grad_norm": 0.2538348128761773,
"kl": 0.0004749298095703125,
"learning_rate": 9.223297290970179e-07,
"loss": 0.0045,
"reward": 1.6205357313156128,
"reward_std": 0.791550874710083,
"rewards/": 6.578869342803955,
"rewards/math_compute_score": 0.380952388048172,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 1736.666748046875,
"epoch": 0.20763723150357996,
"grad_norm": 0.23799398657467383,
"kl": 0.000453948974609375,
"learning_rate": 9.202460382960447e-07,
"loss": 0.0187,
"reward": 1.296758770942688,
"reward_std": 0.758968710899353,
"rewards/": 5.912365436553955,
"rewards/math_compute_score": 0.1428571492433548,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 1815.7857666015625,
"epoch": 0.2100238663484487,
"grad_norm": 0.2548996775884122,
"kl": 0.0004329681396484375,
"learning_rate": 9.181371852644063e-07,
"loss": -0.0126,
"reward": 1.1803152561187744,
"reward_std": 0.7856223583221436,
"rewards/": 6.66348123550415,
"rewards/math_compute_score": -0.190476194024086,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 1961.047607421875,
"epoch": 0.21241050119331742,
"grad_norm": 0.23616170381807244,
"kl": 0.00052642822265625,
"learning_rate": 9.160032962696734e-07,
"loss": 0.0225,
"reward": 1.0959078073501587,
"reward_std": 0.7875651717185974,
"rewards/": 6.431919574737549,
"rewards/math_compute_score": -0.2380952388048172,
"step": 89
},
{
"epoch": 0.21479713603818615,
"grad_norm": 0.3028870348853355,
"learning_rate": 9.138444990784453e-07,
"loss": 0.0746,
"step": 90
},
{
"epoch": 0.21479713603818615,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1827.8512268066406,
"eval_kl": 0.0005598068237304688,
"eval_loss": 0.05406690388917923,
"eval_reward": 1.4557756930589676,
"eval_reward_std": 0.641949962824583,
"eval_rewards/": 6.326497554779053,
"eval_rewards/math_compute_score": 0.2380952462553978,
"eval_runtime": 97.7893,
"eval_samples_per_second": 0.215,
"eval_steps_per_second": 0.01,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 1750.9404907226562,
"epoch": 0.2171837708830549,
"grad_norm": 0.28025306135175526,
"kl": 0.000518798828125,
"learning_rate": 9.116609229486991e-07,
"loss": 0.0822,
"reward": 1.4702892303466797,
"reward_std": 0.848765641450882,
"rewards/": 6.399065256118774,
"rewards/math_compute_score": 0.2380952462553978,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 1697.2857666015625,
"epoch": 0.21957040572792363,
"grad_norm": 0.2620909431841143,
"kl": 0.000553131103515625,
"learning_rate": 9.094526986220512e-07,
"loss": -0.0109,
"reward": 1.3878443241119385,
"reward_std": 0.7288548946380615,
"rewards/": 6.177316188812256,
"rewards/math_compute_score": 0.190476194024086,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 1827.4285888671875,
"epoch": 0.22195704057279236,
"grad_norm": 0.2504018080609149,
"kl": 0.00055694580078125,
"learning_rate": 9.072199583159284e-07,
"loss": 0.0395,
"reward": 1.2157739400863647,
"reward_std": 0.6643213033676147,
"rewards/": 6.269345283508301,
"rewards/math_compute_score": -0.0476190485060215,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 1795.3095703125,
"epoch": 0.2243436754176611,
"grad_norm": 0.22970532451736303,
"kl": 0.0004730224609375,
"learning_rate": 9.04962835715652e-07,
"loss": 0.0283,
"reward": 1.2190290689468384,
"reward_std": 0.7647516131401062,
"rewards/": 6.285621166229248,
"rewards/math_compute_score": -0.0476190485060215,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 1790.047607421875,
"epoch": 0.22673031026252982,
"grad_norm": 0.21772408919059522,
"kl": 0.00058746337890625,
"learning_rate": 9.02681465966433e-07,
"loss": -0.021,
"reward": 1.447767972946167,
"reward_std": 0.6884434819221497,
"rewards/": 6.476934432983398,
"rewards/math_compute_score": 0.190476194024086,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 1935.3809814453125,
"epoch": 0.22911694510739858,
"grad_norm": 0.25618968897371547,
"kl": 0.000701904296875,
"learning_rate": 9.003759856652801e-07,
"loss": 0.0269,
"reward": 1.425186038017273,
"reward_std": 0.651878833770752,
"rewards/": 6.744977951049805,
"rewards/math_compute_score": 0.095238097012043,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 1660.0238037109375,
"epoch": 0.2315035799522673,
"grad_norm": 0.2838831684396231,
"kl": 0.000720977783203125,
"learning_rate": 8.980465328528218e-07,
"loss": 0.0505,
"reward": 1.8797248601913452,
"reward_std": 0.4036564528942108,
"rewards/": 7.2081475257873535,
"rewards/math_compute_score": 0.5476190447807312,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 1829.8333740234375,
"epoch": 0.23389021479713604,
"grad_norm": 0.22853372167994854,
"kl": 0.00064849853515625,
"learning_rate": 8.956932470050403e-07,
"loss": 0.0511,
"reward": 1.724107265472412,
"reward_std": 0.8355273604393005,
"rewards/": 6.52529764175415,
"rewards/math_compute_score": 0.523809552192688,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 1856.261962890625,
"epoch": 0.23627684964200477,
"grad_norm": 0.2512391392455568,
"kl": 0.00080108642578125,
"learning_rate": 8.933162690249208e-07,
"loss": 0.0423,
"reward": 1.442262053489685,
"reward_std": 0.709228515625,
"rewards/": 6.449404716491699,
"rewards/math_compute_score": 0.190476194024086,
"step": 99
},
{
"epoch": 0.2386634844868735,
"grad_norm": 0.26580287893671717,
"learning_rate": 8.909157412340149e-07,
"loss": 0.0522,
"step": 100
},
{
"epoch": 0.2386634844868735,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1771.4226684570312,
"eval_kl": 0.0007572174072265625,
"eval_loss": 0.04274141788482666,
"eval_reward": 1.5543109476566315,
"eval_reward_std": 0.5933023318648338,
"eval_rewards/": 6.509649395942688,
"eval_rewards/math_compute_score": 0.315476194024086,
"eval_runtime": 91.7292,
"eval_samples_per_second": 0.229,
"eval_steps_per_second": 0.011,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 1773.857177734375,
"epoch": 0.24105011933174225,
"grad_norm": 0.2651946749721896,
"kl": 0.0009403228759765625,
"learning_rate": 8.884918073639189e-07,
"loss": 0.0451,
"reward": 1.227901816368103,
"reward_std": 0.608531042933464,
"rewards/": 5.949032783508301,
"rewards/math_compute_score": 0.0476190485060215,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 1692.7381591796875,
"epoch": 0.24343675417661098,
"grad_norm": 0.25923640399562387,
"kl": 0.00080108642578125,
"learning_rate": 8.860446125476686e-07,
"loss": 0.0513,
"reward": 1.4982887506484985,
"reward_std": 0.3660634756088257,
"rewards/": 6.634300708770752,
"rewards/math_compute_score": 0.2142857164144516,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 1747.0,
"epoch": 0.2458233890214797,
"grad_norm": 0.25927402841933767,
"kl": 0.000736236572265625,
"learning_rate": 8.835743033110482e-07,
"loss": 0.0204,
"reward": 1.127715826034546,
"reward_std": 0.7056443691253662,
"rewards/": 6.40048360824585,
"rewards/math_compute_score": -0.190476194024086,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 1954.90478515625,
"epoch": 0.24821002386634844,
"grad_norm": 0.23099992586235157,
"kl": 0.0007476806640625,
"learning_rate": 8.810810275638182e-07,
"loss": 0.0478,
"reward": 1.1020090579986572,
"reward_std": 0.8382893800735474,
"rewards/": 6.462425708770752,
"rewards/math_compute_score": -0.2380952388048172,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 1456.952392578125,
"epoch": 0.25059665871121717,
"grad_norm": 0.2925044722785809,
"kl": 0.000957489013671875,
"learning_rate": 8.785649345908587e-07,
"loss": 0.0075,
"reward": 1.6823569536209106,
"reward_std": 0.5539329648017883,
"rewards/": 6.3165459632873535,
"rewards/math_compute_score": 0.523809552192688,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 1802.4761962890625,
"epoch": 0.2529832935560859,
"grad_norm": 0.25996955962455726,
"kl": 0.000904083251953125,
"learning_rate": 8.760261750432312e-07,
"loss": 0.0588,
"reward": 1.4761160612106323,
"reward_std": 0.5810412764549255,
"rewards/": 6.618675708770752,
"rewards/math_compute_score": 0.190476194024086,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 1391.2381591796875,
"epoch": 0.2553699284009546,
"grad_norm": 0.2857669697125554,
"kl": 0.00119781494140625,
"learning_rate": 8.734649009291583e-07,
"loss": 0.0018,
"reward": 1.8566220998764038,
"reward_std": 0.41003191471099854,
"rewards/": 6.2354912757873535,
"rewards/math_compute_score": 0.761904776096344,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 1657.09521484375,
"epoch": 0.2577565632458234,
"grad_norm": 0.2581521551227329,
"kl": 0.000820159912109375,
"learning_rate": 8.708812656049225e-07,
"loss": 0.0116,
"reward": 1.7328126430511475,
"reward_std": 0.5485076308250427,
"rewards/": 6.28311014175415,
"rewards/math_compute_score": 0.5952380895614624,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 1695.3333740234375,
"epoch": 0.26014319809069214,
"grad_norm": 0.22118118373246637,
"kl": 0.000804901123046875,
"learning_rate": 8.68275423765683e-07,
"loss": -0.0055,
"reward": 1.378050684928894,
"reward_std": 0.5073475241661072,
"rewards/": 6.699777126312256,
"rewards/math_compute_score": 0.0476190485060215,
"step": 109
},
{
"epoch": 0.26252983293556087,
"grad_norm": 0.23367045175580944,
"learning_rate": 8.656475314362147e-07,
"loss": 0.0098,
"step": 110
},
{
"epoch": 0.26252983293556087,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1761.2619323730469,
"eval_kl": 0.0010433197021484375,
"eval_loss": 0.041358206421136856,
"eval_reward": 1.4955194890499115,
"eval_reward_std": 0.6651220917701721,
"eval_rewards/": 6.382359266281128,
"eval_rewards/math_compute_score": 0.2738095261156559,
"eval_runtime": 91.2376,
"eval_samples_per_second": 0.23,
"eval_steps_per_second": 0.011,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 1746.5119018554688,
"epoch": 0.2649164677804296,
"grad_norm": 0.22778632128851484,
"kl": 0.0009899139404296875,
"learning_rate": 8.629977459615654e-07,
"loss": -0.0063,
"reward": 1.8497769236564636,
"reward_std": 0.44625431299209595,
"rewards/": 7.010788679122925,
"rewards/math_compute_score": 0.5595238134264946,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 1869.4285888671875,
"epoch": 0.26730310262529833,
"grad_norm": 0.22506830297475983,
"kl": 0.000957489013671875,
"learning_rate": 8.603262259976348e-07,
"loss": 0.0298,
"reward": 1.6217262744903564,
"reward_std": 0.5545719265937805,
"rewards/": 7.15625,
"rewards/math_compute_score": 0.2380952388048172,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 1583.8333740234375,
"epoch": 0.26968973747016706,
"grad_norm": 0.26335556959246714,
"kl": 0.001190185546875,
"learning_rate": 8.576331315016751e-07,
"loss": 0.074,
"reward": 1.390029788017273,
"reward_std": 0.7560327053070068,
"rewards/": 6.283482074737549,
"rewards/math_compute_score": 0.1666666716337204,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 1648.7857666015625,
"epoch": 0.2720763723150358,
"grad_norm": 0.24404060061496832,
"kl": 0.001220703125,
"learning_rate": 8.549186237227138e-07,
"loss": -0.0057,
"reward": 1.8864582777023315,
"reward_std": 0.5348789691925049,
"rewards/": 7.051339626312256,
"rewards/math_compute_score": 0.5952380895614624,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 1813.2381591796875,
"epoch": 0.2744630071599045,
"grad_norm": 0.29199548341678777,
"kl": 0.00124359130859375,
"learning_rate": 8.52182865191898e-07,
"loss": 0.0717,
"reward": 1.410640001296997,
"reward_std": 0.8225697875022888,
"rewards/": 6.481770992279053,
"rewards/math_compute_score": 0.1428571492433548,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 1860.6905517578125,
"epoch": 0.27684964200477324,
"grad_norm": 0.23564871797794462,
"kl": 0.0011138916015625,
"learning_rate": 8.494260197127648e-07,
"loss": -0.0066,
"reward": 1.5276786088943481,
"reward_std": 0.48343804478645325,
"rewards/": 7.066964626312256,
"rewards/math_compute_score": 0.1428571492433548,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 1599.0714111328125,
"epoch": 0.27923627684964203,
"grad_norm": 0.2704337659468572,
"kl": 0.00121307373046875,
"learning_rate": 8.466482523514309e-07,
"loss": 0.0398,
"reward": 1.6256511211395264,
"reward_std": 0.7011668682098389,
"rewards/": 6.699683666229248,
"rewards/math_compute_score": 0.3571428656578064,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 1592.857177734375,
"epoch": 0.28162291169451076,
"grad_norm": 0.2476126055211145,
"kl": 0.00128936767578125,
"learning_rate": 8.438497294267116e-07,
"loss": 0.0137,
"reward": 2.154017925262451,
"reward_std": 0.3840785324573517,
"rewards/": 7.531994342803955,
"rewards/math_compute_score": 0.8095238208770752,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 1497.952392578125,
"epoch": 0.2840095465393795,
"grad_norm": 0.2830025386276568,
"kl": 0.00151824951171875,
"learning_rate": 8.41030618500161e-07,
"loss": 0.0483,
"reward": 1.8147321939468384,
"reward_std": 0.5045832395553589,
"rewards/": 7.359375,
"rewards/math_compute_score": 0.4285714328289032,
"step": 119
},
{
"epoch": 0.2863961813842482,
"grad_norm": 0.22278442425352968,
"learning_rate": 8.381910883660399e-07,
"loss": 0.0265,
"step": 120
},
{
"epoch": 0.2863961813842482,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1761.5595397949219,
"eval_kl": 0.0013446807861328125,
"eval_loss": 0.06586353480815887,
"eval_reward": 1.6639322936534882,
"eval_reward_std": 0.5864010900259018,
"eval_rewards/": 6.629185318946838,
"eval_rewards/math_compute_score": 0.42261905781924725,
"eval_runtime": 91.3472,
"eval_samples_per_second": 0.23,
"eval_steps_per_second": 0.011,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 1705.65478515625,
"epoch": 0.28878281622911695,
"grad_norm": 0.2508833756487164,
"kl": 0.00110626220703125,
"learning_rate": 8.353313090412091e-07,
"loss": 0.0286,
"reward": 1.5229679942131042,
"reward_std": 0.39858949184417725,
"rewards/": 6.662458419799805,
"rewards/math_compute_score": 0.238095223903656,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 1615.547607421875,
"epoch": 0.2911694510739857,
"grad_norm": 0.2961002801426825,
"kl": 0.0015106201171875,
"learning_rate": 8.3245145175495e-07,
"loss": 0.0157,
"reward": 1.5044642686843872,
"reward_std": 0.36025503277778625,
"rewards/": 6.569940567016602,
"rewards/math_compute_score": 0.2380952388048172,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 1715.0714111328125,
"epoch": 0.2935560859188544,
"grad_norm": 0.2873657925294743,
"kl": 0.0014190673828125,
"learning_rate": 8.295516889387114e-07,
"loss": 0.0686,
"reward": 1.341320276260376,
"reward_std": 0.5270970463752747,
"rewards/": 6.420886993408203,
"rewards/math_compute_score": 0.0714285746216774,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 1663.047607421875,
"epoch": 0.29594272076372313,
"grad_norm": 0.32394764493704503,
"kl": 0.00144195556640625,
"learning_rate": 8.266321942157859e-07,
"loss": 0.0717,
"reward": 1.2409132719039917,
"reward_std": 0.8710536360740662,
"rewards/": 6.204566478729248,
"rewards/math_compute_score": 0.0,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 1718.40478515625,
"epoch": 0.29832935560859186,
"grad_norm": 0.26261921476130473,
"kl": 0.0014495849609375,
"learning_rate": 8.236931423909138e-07,
"loss": 0.0276,
"reward": 1.4648065567016602,
"reward_std": 0.8197119832038879,
"rewards/": 6.181175708770752,
"rewards/math_compute_score": 0.2857142984867096,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 1603.666748046875,
"epoch": 0.30071599045346065,
"grad_norm": 0.2761289543593296,
"kl": 0.00135040283203125,
"learning_rate": 8.207347094398171e-07,
"loss": 0.0157,
"reward": 1.2167319059371948,
"reward_std": 0.5766604542732239,
"rewards/": 5.893182754516602,
"rewards/math_compute_score": 0.0476190485060215,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 1712.166748046875,
"epoch": 0.3031026252983294,
"grad_norm": 0.3314518691922463,
"kl": 0.001708984375,
"learning_rate": 8.177570724986626e-07,
"loss": 0.0594,
"reward": 1.0614583492279053,
"reward_std": 0.4964538514614105,
"rewards/": 5.783482074737549,
"rewards/math_compute_score": -0.1190476194024086,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 1777.59521484375,
"epoch": 0.3054892601431981,
"grad_norm": 0.23838654656179914,
"kl": 0.00142669677734375,
"learning_rate": 8.14760409853456e-07,
"loss": -0.001,
"reward": 1.4808967113494873,
"reward_std": 0.532819926738739,
"rewards/": 6.833054542541504,
"rewards/math_compute_score": 0.1428571492433548,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 1779.09521484375,
"epoch": 0.30787589498806683,
"grad_norm": 0.2415045050319174,
"kl": 0.00141143798828125,
"learning_rate": 8.117449009293668e-07,
"loss": 0.0146,
"reward": 1.3048317432403564,
"reward_std": 0.6548949480056763,
"rewards/": 6.524158477783203,
"rewards/math_compute_score": 0.0,
"step": 129
},
{
"epoch": 0.31026252983293556,
"grad_norm": 0.22090091602582893,
"learning_rate": 8.087107262799855e-07,
"loss": -0.051,
"step": 130
},
{
"epoch": 0.31026252983293556,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1763.0952758789062,
"eval_kl": 0.001575469970703125,
"eval_loss": 0.01967952772974968,
"eval_reward": 1.662981390953064,
"eval_reward_std": 0.5752230435609818,
"eval_rewards/": 6.529192328453064,
"eval_rewards/math_compute_score": 0.4464285746216774,
"eval_runtime": 98.0654,
"eval_samples_per_second": 0.214,
"eval_steps_per_second": 0.01,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 1817.3452758789062,
"epoch": 0.3126491646778043,
"grad_norm": 0.2602875367873934,
"kl": 0.00157928466796875,
"learning_rate": 8.056580675765129e-07,
"loss": 0.0341,
"reward": 1.412472128868103,
"reward_std": 0.7119008004665375,
"rewards/": 6.871884346008301,
"rewards/math_compute_score": 0.04761905036866665,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 1185.5238037109375,
"epoch": 0.315035799522673,
"grad_norm": 0.2738706058516209,
"kl": 0.00148773193359375,
"learning_rate": 8.025871075968826e-07,
"loss": 0.0178,
"reward": 1.7256139516830444,
"reward_std": 0.424249529838562,
"rewards/": 5.866164684295654,
"rewards/math_compute_score": 0.6904761791229248,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 1664.547607421875,
"epoch": 0.31742243436754175,
"grad_norm": 0.2672055217932093,
"kl": 0.00159454345703125,
"learning_rate": 7.994980302148169e-07,
"loss": 0.0251,
"reward": 1.4857888221740723,
"reward_std": 0.5646493434906006,
"rewards/": 7.0479912757873535,
"rewards/math_compute_score": 0.095238097012043,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 1582.1190185546875,
"epoch": 0.3198090692124105,
"grad_norm": 0.23074518957473036,
"kl": 0.0013427734375,
"learning_rate": 7.963910203888176e-07,
"loss": 0.0051,
"reward": 1.954390048980713,
"reward_std": 0.568084180355072,
"rewards/": 6.914806842803955,
"rewards/math_compute_score": 0.7142857313156128,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 1632.4285888671875,
"epoch": 0.3221957040572792,
"grad_norm": 0.2984688074510682,
"kl": 0.002105712890625,
"learning_rate": 7.932662641510914e-07,
"loss": 0.0074,
"reward": 1.4273810386657715,
"reward_std": 0.2996509373188019,
"rewards/": 6.470238208770752,
"rewards/math_compute_score": 0.1666666716337204,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 1608.90478515625,
"epoch": 0.324582338902148,
"grad_norm": 0.2744118083019373,
"kl": 0.001983642578125,
"learning_rate": 7.90123948596412e-07,
"loss": 0.0955,
"reward": 1.621465802192688,
"reward_std": 0.7447255849838257,
"rewards/": 6.583519458770752,
"rewards/math_compute_score": 0.380952388048172,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 1611.0238037109375,
"epoch": 0.3269689737470167,
"grad_norm": 0.2625294125709787,
"kl": 0.001495361328125,
"learning_rate": 7.86964261870916e-07,
"loss": 0.0211,
"reward": 1.5608538389205933,
"reward_std": 0.679740309715271,
"rewards/": 6.566174030303955,
"rewards/math_compute_score": 0.3095238208770752,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 1599.9285888671875,
"epoch": 0.32935560859188545,
"grad_norm": 0.3038466252946791,
"kl": 0.00173187255859375,
"learning_rate": 7.837873931608399e-07,
"loss": 0.0589,
"reward": 1.10877525806427,
"reward_std": 0.8366554975509644,
"rewards/": 5.44863748550415,
"rewards/math_compute_score": 0.02380952425301075,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 1703.666748046875,
"epoch": 0.3317422434367542,
"grad_norm": 0.25300249483455434,
"kl": 0.0018157958984375,
"learning_rate": 7.805935326811912e-07,
"loss": 0.0349,
"reward": 1.8174108266830444,
"reward_std": 0.490249365568161,
"rewards/": 6.610863208770752,
"rewards/math_compute_score": 0.6190476417541504,
"step": 139
},
{
"epoch": 0.3341288782816229,
"grad_norm": 0.2417853507095786,
"learning_rate": 7.773828716643592e-07,
"loss": 0.018,
"step": 140
},
{
"epoch": 0.3341288782816229,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1762.2500305175781,
"eval_kl": 0.0018558502197265625,
"eval_loss": 0.025447282940149307,
"eval_reward": 1.6102934777736664,
"eval_reward_std": 0.6778208911418915,
"eval_rewards/": 6.456229090690613,
"eval_rewards/math_compute_score": 0.3988095265813172,
"eval_runtime": 91.0536,
"eval_samples_per_second": 0.231,
"eval_steps_per_second": 0.011,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 1680.9880981445312,
"epoch": 0.33651551312649164,
"grad_norm": 0.27816646041180015,
"kl": 0.001827239990234375,
"learning_rate": 7.741556023486654e-07,
"loss": 0.0594,
"reward": 1.5625837445259094,
"reward_std": 0.5163165330886841,
"rewards/": 6.6224424839019775,
"rewards/math_compute_score": 0.2976190522313118,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 1580.6190185546875,
"epoch": 0.33890214797136037,
"grad_norm": 0.2833770658693713,
"kl": 0.0019989013671875,
"learning_rate": 7.709119179668537e-07,
"loss": 0.0394,
"reward": 1.7200149297714233,
"reward_std": 0.4698960483074188,
"rewards/": 6.885788917541504,
"rewards/math_compute_score": 0.4285714328289032,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 1628.1905517578125,
"epoch": 0.3412887828162291,
"grad_norm": 0.2536982306965292,
"kl": 0.00177764892578125,
"learning_rate": 7.676520127345196e-07,
"loss": 0.0159,
"reward": 1.4838913679122925,
"reward_std": 0.6197465658187866,
"rewards/": 6.562314033508301,
"rewards/math_compute_score": 0.2142857164144516,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 1552.3809814453125,
"epoch": 0.3436754176610978,
"grad_norm": 0.31347368028504985,
"kl": 0.00180816650390625,
"learning_rate": 7.643760818384819e-07,
"loss": 0.087,
"reward": 1.6655505895614624,
"reward_std": 0.6516547799110413,
"rewards/": 6.042038917541504,
"rewards/math_compute_score": 0.5714285969734192,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 1977.2857666015625,
"epoch": 0.3460620525059666,
"grad_norm": 0.25241631041768503,
"kl": 0.00189208984375,
"learning_rate": 7.610843214250964e-07,
"loss": 0.0293,
"reward": 1.0822917222976685,
"reward_std": 0.6940091252326965,
"rewards/": 5.982886791229248,
"rewards/math_compute_score": -0.1428571492433548,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 1490.0238037109375,
"epoch": 0.34844868735083534,
"grad_norm": 0.2982197191520602,
"kl": 0.0022430419921875,
"learning_rate": 7.577769285885108e-07,
"loss": 0.0155,
"reward": 1.8208333253860474,
"reward_std": 0.4490146338939667,
"rewards/": 6.913690567016602,
"rewards/math_compute_score": 0.5476190447807312,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 1673.857177734375,
"epoch": 0.35083532219570407,
"grad_norm": 0.278252727347962,
"kl": 0.002197265625,
"learning_rate": 7.544541013588644e-07,
"loss": 0.0415,
"reward": 1.8191593885421753,
"reward_std": 0.6377332210540771,
"rewards/": 7.2862725257873535,
"rewards/math_compute_score": 0.4523809552192688,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 1316.261962890625,
"epoch": 0.3532219570405728,
"grad_norm": 0.28120546569751265,
"kl": 0.0024871826171875,
"learning_rate": 7.511160386904305e-07,
"loss": -0.0168,
"reward": 2.0831844806671143,
"reward_std": 0.187238872051239,
"rewards/": 7.749256134033203,
"rewards/math_compute_score": 0.6666666865348816,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 1709.547607421875,
"epoch": 0.3556085918854415,
"grad_norm": 0.23539062329651342,
"kl": 0.001708984375,
"learning_rate": 7.477629404497047e-07,
"loss": 0.0365,
"reward": 1.254538655281067,
"reward_std": 0.7538212537765503,
"rewards/": 6.177455425262451,
"rewards/math_compute_score": 0.02380952425301075,
"step": 149
},
{
"epoch": 0.35799522673031026,
"grad_norm": 0.24520940371651193,
"learning_rate": 7.443950074034367e-07,
"loss": 0.0149,
"step": 150
},
{
"epoch": 0.35799522673031026,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1737.2857360839844,
"eval_kl": 0.0021305084228515625,
"eval_loss": 0.042245958000421524,
"eval_reward": 1.562076896429062,
"eval_reward_std": 0.6442549824714661,
"eval_rewards/": 6.334193706512451,
"eval_rewards/math_compute_score": 0.36904762499034405,
"eval_runtime": 90.9442,
"eval_samples_per_second": 0.231,
"eval_steps_per_second": 0.011,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 1729.0357055664062,
"epoch": 0.360381861575179,
"grad_norm": 0.26927693948058945,
"kl": 0.0023193359375,
"learning_rate": 7.41012441206611e-07,
"loss": -0.0032,
"reward": 1.4532668590545654,
"reward_std": 0.5979789793491364,
"rewards/": 6.504429578781128,
"rewards/math_compute_score": 0.19047619495540857,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 1618.3333740234375,
"epoch": 0.3627684964200477,
"grad_norm": 0.27430715469286404,
"kl": 0.002349853515625,
"learning_rate": 7.376154443903713e-07,
"loss": 0.0893,
"reward": 1.5656062364578247,
"reward_std": 0.6696694493293762,
"rewards/": 6.7804131507873535,
"rewards/math_compute_score": 0.261904776096344,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 1713.1429443359375,
"epoch": 0.36515513126491644,
"grad_norm": 0.24192656702948537,
"kl": 0.00191497802734375,
"learning_rate": 7.342042203498951e-07,
"loss": 0.0093,
"reward": 1.3910435438156128,
"reward_std": 0.8083306550979614,
"rewards/": 6.574265480041504,
"rewards/math_compute_score": 0.095238097012043,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 1752.547607421875,
"epoch": 0.36754176610978523,
"grad_norm": 0.23529364280462375,
"kl": 0.00177764892578125,
"learning_rate": 7.307789733322145e-07,
"loss": 0.0412,
"reward": 1.2998976707458496,
"reward_std": 0.6855893135070801,
"rewards/": 6.499488353729248,
"rewards/math_compute_score": 0.0,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 1709.09521484375,
"epoch": 0.36992840095465396,
"grad_norm": 0.2565888286348459,
"kl": 0.0020294189453125,
"learning_rate": 7.273399084239878e-07,
"loss": 0.0343,
"reward": 1.2922619581222534,
"reward_std": 0.5724942684173584,
"rewards/": 6.842262268066406,
"rewards/math_compute_score": -0.095238097012043,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 1770.0238037109375,
"epoch": 0.3723150357995227,
"grad_norm": 0.2556634342782622,
"kl": 0.0023651123046875,
"learning_rate": 7.238872315392189e-07,
"loss": 0.0484,
"reward": 1.8619048595428467,
"reward_std": 0.4541710913181305,
"rewards/": 7.214285850524902,
"rewards/math_compute_score": 0.523809552192688,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 1747.21435546875,
"epoch": 0.3747016706443914,
"grad_norm": 0.23922842509554956,
"kl": 0.00201416015625,
"learning_rate": 7.204211494069291e-07,
"loss": -0.0377,
"reward": 1.7935267686843872,
"reward_std": 0.5942096710205078,
"rewards/": 7.062871932983398,
"rewards/math_compute_score": 0.4761904776096344,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 1375.4285888671875,
"epoch": 0.37708830548926014,
"grad_norm": 0.2504129627401833,
"kl": 0.002655029296875,
"learning_rate": 7.16941869558779e-07,
"loss": 0.0313,
"reward": 2.2074406147003174,
"reward_std": 0.2237553596496582,
"rewards/": 7.418154716491699,
"rewards/math_compute_score": 0.9047619104385376,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 1834.4761962890625,
"epoch": 0.3794749403341289,
"grad_norm": 0.22770506606151802,
"kl": 0.00201416015625,
"learning_rate": 7.134496003166423e-07,
"loss": 0.0172,
"reward": 1.2007441520690918,
"reward_std": 0.6796280741691589,
"rewards/": 6.194196701049805,
"rewards/math_compute_score": -0.0476190485060215,
"step": 159
},
{
"epoch": 0.3818615751789976,
"grad_norm": 0.27823803082361936,
"learning_rate": 7.099445507801323e-07,
"loss": 0.0394,
"step": 160
},
{
"epoch": 0.3818615751789976,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1710.9940795898438,
"eval_kl": 0.002544403076171875,
"eval_loss": 0.03967365622520447,
"eval_reward": 1.7245815098285675,
"eval_reward_std": 0.6458448991179466,
"eval_rewards/": 6.646717071533203,
"eval_rewards/math_compute_score": 0.4940476305782795,
"eval_runtime": 90.5014,
"eval_samples_per_second": 0.232,
"eval_steps_per_second": 0.011,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 1524.2262573242188,
"epoch": 0.38424821002386633,
"grad_norm": 0.28852725034404547,
"kl": 0.0025177001953125,
"learning_rate": 7.064269308140829e-07,
"loss": 0.0211,
"reward": 1.7621653079986572,
"reward_std": 0.4823741465806961,
"rewards/": 7.048921346664429,
"rewards/math_compute_score": 0.4404761865735054,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 1638.952392578125,
"epoch": 0.38663484486873506,
"grad_norm": 0.25636209494281637,
"kl": 0.002410888671875,
"learning_rate": 7.02896951035982e-07,
"loss": 0.044,
"reward": 1.4277158975601196,
"reward_std": 0.5193288922309875,
"rewards/": 6.852864742279053,
"rewards/math_compute_score": 0.0714285746216774,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 1781.119140625,
"epoch": 0.38902147971360385,
"grad_norm": 0.217013334112183,
"kl": 0.00238037109375,
"learning_rate": 6.993548228033617e-07,
"loss": 0.0355,
"reward": 1.5361608266830444,
"reward_std": 0.46594858169555664,
"rewards/": 6.347470283508301,
"rewards/math_compute_score": 0.3333333432674408,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 1553.2381591796875,
"epoch": 0.3914081145584726,
"grad_norm": 0.24395443915530762,
"kl": 0.00225830078125,
"learning_rate": 6.958007582011424e-07,
"loss": 0.0249,
"reward": 2.022023916244507,
"reward_std": 0.3801310360431671,
"rewards/": 7.6339287757873535,
"rewards/math_compute_score": 0.6190476417541504,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 1617.8095703125,
"epoch": 0.3937947494033413,
"grad_norm": 0.24004403569382207,
"kl": 0.0025787353515625,
"learning_rate": 6.922349700289347e-07,
"loss": 0.0256,
"reward": 1.8187501430511475,
"reward_std": 0.47890540957450867,
"rewards/": 6.998512268066406,
"rewards/math_compute_score": 0.523809552192688,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 1533.261962890625,
"epoch": 0.39618138424821003,
"grad_norm": 0.23219253681724725,
"kl": 0.002532958984375,
"learning_rate": 6.886576717882981e-07,
"loss": -0.0059,
"reward": 1.9263392686843872,
"reward_std": 0.4234418570995331,
"rewards/": 6.8697919845581055,
"rewards/math_compute_score": 0.6904761791229248,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 1722.21435546875,
"epoch": 0.39856801909307876,
"grad_norm": 0.2695428246946182,
"kl": 0.0027008056640625,
"learning_rate": 6.850690776699573e-07,
"loss": -0.002,
"reward": 1.3836426734924316,
"reward_std": 0.5190478563308716,
"rewards/": 6.346784591674805,
"rewards/math_compute_score": 0.1428571492433548,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 1703.90478515625,
"epoch": 0.4009546539379475,
"grad_norm": 0.2480850316910639,
"kl": 0.0023040771484375,
"learning_rate": 6.814694025409773e-07,
"loss": 0.0089,
"reward": 1.8032739162445068,
"reward_std": 0.4905838966369629,
"rewards/": 7.111607074737549,
"rewards/math_compute_score": 0.4761904776096344,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 1487.3333740234375,
"epoch": 0.4033412887828162,
"grad_norm": 0.2441614041062561,
"kl": 0.002593994140625,
"learning_rate": 6.778588619318993e-07,
"loss": 0.0052,
"reward": 1.5892950296401978,
"reward_std": 0.679408609867096,
"rewards/": 5.8512372970581055,
"rewards/math_compute_score": 0.523809552192688,
"step": 169
},
{
"epoch": 0.40572792362768495,
"grad_norm": 0.28274768118904486,
"learning_rate": 6.742376720238346e-07,
"loss": -0.0091,
"step": 170
},
{
"epoch": 0.40572792362768495,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1709.7024230957031,
"eval_kl": 0.0027008056640625,
"eval_loss": 0.03433432802557945,
"eval_reward": 1.67239710688591,
"eval_reward_std": 0.5494325160980225,
"eval_rewards/": 6.457223296165466,
"eval_rewards/math_compute_score": 0.47619048599153757,
"eval_runtime": 90.002,
"eval_samples_per_second": 0.233,
"eval_steps_per_second": 0.011,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 1721.3690795898438,
"epoch": 0.4081145584725537,
"grad_norm": 0.2507835886818597,
"kl": 0.00286865234375,
"learning_rate": 6.706060496355211e-07,
"loss": 0.0396,
"reward": 1.640829622745514,
"reward_std": 0.6370173096656799,
"rewards/": 6.680338621139526,
"rewards/math_compute_score": 0.380952388048172,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 1680.547607421875,
"epoch": 0.4105011933174224,
"grad_norm": 0.24695563782362182,
"kl": 0.0029449462890625,
"learning_rate": 6.669642122103422e-07,
"loss": 0.0106,
"reward": 1.5374256372451782,
"reward_std": 0.48627233505249023,
"rewards/": 6.734746932983398,
"rewards/math_compute_score": 0.2380952388048172,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 1830.261962890625,
"epoch": 0.4128878281622912,
"grad_norm": 0.27668091559057234,
"kl": 0.0030975341796875,
"learning_rate": 6.633123778033061e-07,
"loss": 0.0415,
"reward": 1.9467262029647827,
"reward_std": 0.5212621092796326,
"rewards/": 6.876488208770752,
"rewards/math_compute_score": 0.7142857313156128,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 1621.71435546875,
"epoch": 0.4152744630071599,
"grad_norm": 0.27868551362271904,
"kl": 0.0023345947265625,
"learning_rate": 6.596507650679899e-07,
"loss": 0.0289,
"reward": 1.2761160135269165,
"reward_std": 0.6406970620155334,
"rewards/": 6.380580425262451,
"rewards/math_compute_score": 0.0,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 1595.8095703125,
"epoch": 0.41766109785202865,
"grad_norm": 0.2972079507840572,
"kl": 0.0031585693359375,
"learning_rate": 6.559795932434488e-07,
"loss": 0.0709,
"reward": 1.8078126907348633,
"reward_std": 0.5548811554908752,
"rewards/": 6.943824768066406,
"rewards/math_compute_score": 0.523809552192688,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 1457.666748046875,
"epoch": 0.4200477326968974,
"grad_norm": 0.2279280487796575,
"kl": 0.0028076171875,
"learning_rate": 6.52299082141088e-07,
"loss": -0.0355,
"reward": 2.040308952331543,
"reward_std": 0.5329591631889343,
"rewards/": 7.05868673324585,
"rewards/math_compute_score": 0.785714328289032,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 1632.261962890625,
"epoch": 0.4224343675417661,
"grad_norm": 0.24847601288466012,
"kl": 0.0029449462890625,
"learning_rate": 6.486094521315021e-07,
"loss": 0.0605,
"reward": 1.9953869581222534,
"reward_std": 0.5443364381790161,
"rewards/": 7.1197919845581055,
"rewards/math_compute_score": 0.7142857313156128,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 1861.4285888671875,
"epoch": 0.42482100238663484,
"grad_norm": 0.22918857863503886,
"kl": 0.002655029296875,
"learning_rate": 6.449109241312802e-07,
"loss": 0.003,
"reward": 1.405282735824585,
"reward_std": 0.6072686314582825,
"rewards/": 6.074032783508301,
"rewards/math_compute_score": 0.2380952388048172,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 1707.8809814453125,
"epoch": 0.42720763723150357,
"grad_norm": 0.2734274914799027,
"kl": 0.002685546875,
"learning_rate": 6.412037195897785e-07,
"loss": 0.0409,
"reward": 1.6891371011734009,
"reward_std": 0.5659449100494385,
"rewards/": 7.5885419845581055,
"rewards/math_compute_score": 0.2142857164144516,
"step": 179
},
{
"epoch": 0.4295942720763723,
"grad_norm": 0.25298589420589623,
"learning_rate": 6.374880604758614e-07,
"loss": -0.0146,
"step": 180
},
{
"epoch": 0.4295942720763723,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1732.7916870117188,
"eval_kl": 0.002941131591796875,
"eval_loss": 0.04335374757647514,
"eval_reward": 1.724925696849823,
"eval_reward_std": 0.5939712524414062,
"eval_rewards/": 6.6722471714019775,
"eval_rewards/math_compute_score": 0.4880952462553978,
"eval_runtime": 90.8058,
"eval_samples_per_second": 0.231,
"eval_steps_per_second": 0.011,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 1528.4166870117188,
"epoch": 0.431980906921241,
"grad_norm": 0.31782017197338924,
"kl": 0.0032196044921875,
"learning_rate": 6.337641692646106e-07,
"loss": 0.0545,
"reward": 1.7690011262893677,
"reward_std": 0.47228382527828217,
"rewards/": 7.130719900131226,
"rewards/math_compute_score": 0.4285714477300644,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 1502.8095703125,
"epoch": 0.4343675417661098,
"grad_norm": 0.26689710039208736,
"kl": 0.003662109375,
"learning_rate": 6.300322689240041e-07,
"loss": 0.0446,
"reward": 1.989508867263794,
"reward_std": 0.41250666975975037,
"rewards/": 6.518973350524902,
"rewards/math_compute_score": 0.8571428656578064,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 1575.452392578125,
"epoch": 0.43675417661097854,
"grad_norm": 0.27786153846351275,
"kl": 0.003875732421875,
"learning_rate": 6.262925829015675e-07,
"loss": -0.0018,
"reward": 1.7005953788757324,
"reward_std": 0.36450114846229553,
"rewards/": 7.264881134033203,
"rewards/math_compute_score": 0.3095238208770752,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 1625.8809814453125,
"epoch": 0.43914081145584727,
"grad_norm": 0.2596912738373031,
"kl": 0.0035247802734375,
"learning_rate": 6.225453351109934e-07,
"loss": -0.0137,
"reward": 1.7954614162445068,
"reward_std": 0.3381480872631073,
"rewards/": 7.263020992279053,
"rewards/math_compute_score": 0.4285714328289032,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 1478.6190185546875,
"epoch": 0.441527446300716,
"grad_norm": 0.286737248058935,
"kl": 0.0033416748046875,
"learning_rate": 6.187907499187356e-07,
"loss": 0.0023,
"reward": 1.675409197807312,
"reward_std": 0.4261094629764557,
"rewards/": 7.043713092803955,
"rewards/math_compute_score": 0.3333333432674408,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 1754.2857666015625,
"epoch": 0.4439140811455847,
"grad_norm": 0.31883404722867287,
"kl": 0.003570556640625,
"learning_rate": 6.150290521305745e-07,
"loss": 0.0083,
"reward": 1.1777018308639526,
"reward_std": 0.5951432585716248,
"rewards/": 6.55517578125,
"rewards/math_compute_score": -0.1666666716337204,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 1574.8333740234375,
"epoch": 0.44630071599045346,
"grad_norm": 0.2623240542860342,
"kl": 0.0038604736328125,
"learning_rate": 6.112604669781572e-07,
"loss": 0.0151,
"reward": 2.1099700927734375,
"reward_std": 0.4681752920150757,
"rewards/": 7.502232074737549,
"rewards/math_compute_score": 0.761904776096344,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 1458.71435546875,
"epoch": 0.4486873508353222,
"grad_norm": 0.2584305452560197,
"kl": 0.0034027099609375,
"learning_rate": 6.074852201055121e-07,
"loss": 0.0251,
"reward": 1.9345983266830444,
"reward_std": 0.48896414041519165,
"rewards/": 7.387277126312256,
"rewards/math_compute_score": 0.5714285969734192,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 1851.6905517578125,
"epoch": 0.4510739856801909,
"grad_norm": 0.23937901273037387,
"kl": 0.0029449462890625,
"learning_rate": 6.037035375555375e-07,
"loss": 0.0495,
"reward": 1.6072173118591309,
"reward_std": 0.6452977657318115,
"rewards/": 6.8932294845581055,
"rewards/math_compute_score": 0.2857142984867096,
"step": 189
},
{
"epoch": 0.45346062052505964,
"grad_norm": 0.2790819447741767,
"learning_rate": 5.999156457564685e-07,
"loss": 0.0699,
"step": 190
},
{
"epoch": 0.45346062052505964,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1715.4345703125,
"eval_kl": 0.0034027099609375,
"eval_loss": 0.029469896107912064,
"eval_reward": 1.7646177113056183,
"eval_reward_std": 0.5356989577412605,
"eval_rewards/": 6.751659631729126,
"eval_rewards/math_compute_score": 0.5178571455180645,
"eval_runtime": 90.5105,
"eval_samples_per_second": 0.232,
"eval_steps_per_second": 0.011,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 1566.2857666015625,
"epoch": 0.45584725536992843,
"grad_norm": 0.26673369685055265,
"kl": 0.0030975341796875,
"learning_rate": 5.961217715083184e-07,
"loss": -0.0247,
"reward": 1.439574122428894,
"reward_std": 0.4712224751710892,
"rewards/": 6.388346433639526,
"rewards/math_compute_score": 0.20238095708191395,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 1669.6905517578125,
"epoch": 0.45823389021479716,
"grad_norm": 0.288642662443768,
"kl": 0.00396728515625,
"learning_rate": 5.923221419693001e-07,
"loss": 0.055,
"reward": 1.3469215631484985,
"reward_std": 0.6074704527854919,
"rewards/": 6.3536553382873535,
"rewards/math_compute_score": 0.095238097012043,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 1724.9761962890625,
"epoch": 0.4606205250596659,
"grad_norm": 0.2442715266509315,
"kl": 0.0036773681640625,
"learning_rate": 5.885169846422241e-07,
"loss": 0.0315,
"reward": 1.85975182056427,
"reward_std": 0.6531968712806702,
"rewards/": 6.822567939758301,
"rewards/math_compute_score": 0.6190476417541504,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 1360.5238037109375,
"epoch": 0.4630071599045346,
"grad_norm": 0.24247369346072625,
"kl": 0.0033416748046875,
"learning_rate": 5.847065273608777e-07,
"loss": -0.0394,
"reward": 2.0778274536132812,
"reward_std": 0.3715685307979584,
"rewards/": 7.246279716491699,
"rewards/math_compute_score": 0.785714328289032,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 1426.1429443359375,
"epoch": 0.46539379474940334,
"grad_norm": 0.3036683766801178,
"kl": 0.003997802734375,
"learning_rate": 5.808909982763825e-07,
"loss": 0.0353,
"reward": 2.0206658840179443,
"reward_std": 0.4580070972442627,
"rewards/": 6.865234375,
"rewards/math_compute_score": 0.8095238208770752,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 1853.0,
"epoch": 0.4677804295942721,
"grad_norm": 0.24630009542019882,
"kl": 0.0035552978515625,
"learning_rate": 5.770706258435342e-07,
"loss": 0.0068,
"reward": 1.443489670753479,
"reward_std": 0.6034876108169556,
"rewards/": 6.836495876312256,
"rewards/math_compute_score": 0.095238097012043,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 1722.3809814453125,
"epoch": 0.4701670644391408,
"grad_norm": 0.2804061267617125,
"kl": 0.0036468505859375,
"learning_rate": 5.732456388071246e-07,
"loss": 0.0911,
"reward": 1.7890625,
"reward_std": 0.8560119271278381,
"rewards/": 7.231027126312256,
"rewards/math_compute_score": 0.4285714328289032,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 1708.0,
"epoch": 0.47255369928400953,
"grad_norm": 0.23422157987379022,
"kl": 0.0026702880859375,
"learning_rate": 5.694162661882443e-07,
"loss": 0.0098,
"reward": 1.418210506439209,
"reward_std": 0.5666205286979675,
"rewards/": 6.51962423324585,
"rewards/math_compute_score": 0.1428571492433548,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 1664.6905517578125,
"epoch": 0.47494033412887826,
"grad_norm": 0.27629085755921956,
"kl": 0.0035400390625,
"learning_rate": 5.655827372705711e-07,
"loss": 0.0046,
"reward": 1.649553656578064,
"reward_std": 0.35802432894706726,
"rewards/": 7.104910850524902,
"rewards/math_compute_score": 0.2857142984867096,
"step": 199
},
{
"epoch": 0.477326968973747,
"grad_norm": 0.26492233859737124,
"learning_rate": 5.617452815866409e-07,
"loss": 0.0269,
"step": 200
},
{
"epoch": 0.477326968973747,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1717.8690795898438,
"eval_kl": 0.0035858154296875,
"eval_loss": 0.018599843606352806,
"eval_reward": 1.7809989750385284,
"eval_reward_std": 0.5547928586602211,
"eval_rewards/": 6.857375502586365,
"eval_rewards/math_compute_score": 0.511904776096344,
"eval_runtime": 89.9494,
"eval_samples_per_second": 0.233,
"eval_steps_per_second": 0.011,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 1621.952392578125,
"epoch": 0.4797136038186158,
"grad_norm": 0.2851430586205352,
"kl": 0.00341033935546875,
"learning_rate": 5.579041289041045e-07,
"loss": 0.041,
"reward": 1.6838914155960083,
"reward_std": 0.5416805893182755,
"rewards/": 6.9908857345581055,
"rewards/math_compute_score": 0.357142873108387,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 1644.59521484375,
"epoch": 0.4821002386634845,
"grad_norm": 0.22556906762590703,
"kl": 0.0029449462890625,
"learning_rate": 5.540595092119708e-07,
"loss": -0.0015,
"reward": 1.4434523582458496,
"reward_std": 0.36706992983818054,
"rewards/": 6.836309432983398,
"rewards/math_compute_score": 0.095238097012043,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 1847.2857666015625,
"epoch": 0.48448687350835323,
"grad_norm": 0.2540668611221056,
"kl": 0.0035858154296875,
"learning_rate": 5.502116527068362e-07,
"loss": 0.0057,
"reward": 1.016341209411621,
"reward_std": 0.497652530670166,
"rewards/": 6.605515480041504,
"rewards/math_compute_score": -0.380952388048172,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 1572.71435546875,
"epoch": 0.48687350835322196,
"grad_norm": 0.26513430034135327,
"kl": 0.004119873046875,
"learning_rate": 5.463607897791005e-07,
"loss": 0.0501,
"reward": 1.8174108266830444,
"reward_std": 0.49505147337913513,
"rewards/": 6.991815567016602,
"rewards/math_compute_score": 0.523809552192688,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 1550.0,
"epoch": 0.4892601431980907,
"grad_norm": 0.2697202949991492,
"kl": 0.003753662109375,
"learning_rate": 5.425071509991736e-07,
"loss": 0.0289,
"reward": 1.7891370058059692,
"reward_std": 0.45509421825408936,
"rewards/": 6.850446701049805,
"rewards/math_compute_score": 0.523809552192688,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 1605.357177734375,
"epoch": 0.4916467780429594,
"grad_norm": 0.26407614115571215,
"kl": 0.003936767578125,
"learning_rate": 5.386509671036695e-07,
"loss": 0.0412,
"reward": 1.5840773582458496,
"reward_std": 0.44521623849868774,
"rewards/": 7.253720283508301,
"rewards/math_compute_score": 0.1666666716337204,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 1551.5714111328125,
"epoch": 0.49403341288782815,
"grad_norm": 0.24883737154825586,
"kl": 0.0035247802734375,
"learning_rate": 5.347924689815906e-07,
"loss": -0.023,
"reward": 1.7059524059295654,
"reward_std": 0.4412461817264557,
"rewards/": 6.910714626312256,
"rewards/math_compute_score": 0.4047619104385376,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 1707.59521484375,
"epoch": 0.4964200477326969,
"grad_norm": 0.26824875622279615,
"kl": 0.003326416015625,
"learning_rate": 5.309318876605042e-07,
"loss": 0.0433,
"reward": 1.2729166746139526,
"reward_std": 0.6343668103218079,
"rewards/": 6.745535850524902,
"rewards/math_compute_score": -0.095238097012043,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 1499.952392578125,
"epoch": 0.4988066825775656,
"grad_norm": 0.2552057559316855,
"kl": 0.0040283203125,
"learning_rate": 5.270694542927088e-07,
"loss": 0.0605,
"reward": 1.950334906578064,
"reward_std": 0.6608874797821045,
"rewards/": 7.085007667541504,
"rewards/math_compute_score": 0.6666666865348816,
"step": 209
},
{
"epoch": 0.5011933174224343,
"grad_norm": 0.26825451466871253,
"learning_rate": 5.232054001413941e-07,
"loss": 0.0836,
"step": 210
},
{
"epoch": 0.5011933174224343,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1666.1667175292969,
"eval_kl": 0.00366973876953125,
"eval_loss": 0.032919324934482574,
"eval_reward": 1.8185571432113647,
"eval_reward_std": 0.553740456700325,
"eval_rewards/": 6.854689955711365,
"eval_rewards/math_compute_score": 0.5595238246023655,
"eval_runtime": 89.3052,
"eval_samples_per_second": 0.235,
"eval_steps_per_second": 0.011,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 1667.202392578125,
"epoch": 0.5035799522673031,
"grad_norm": 0.2303140614245728,
"kl": 0.0036773681640625,
"learning_rate": 5.193399565667944e-07,
"loss": -0.0107,
"reward": 1.8566593527793884,
"reward_std": 0.5822675228118896,
"rewards/": 6.902343988418579,
"rewards/math_compute_score": 0.5952381044626236,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 1385.21435546875,
"epoch": 0.5059665871121718,
"grad_norm": 0.26853565694319703,
"kl": 0.004119873046875,
"learning_rate": 5.154733550123355e-07,
"loss": 0.0248,
"reward": 1.8717262744903564,
"reward_std": 0.28672024607658386,
"rewards/": 6.882440567016602,
"rewards/math_compute_score": 0.6190476417541504,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 1742.666748046875,
"epoch": 0.5083532219570406,
"grad_norm": 0.30033605387329526,
"kl": 0.004180908203125,
"learning_rate": 5.116058269907778e-07,
"loss": 0.0691,
"reward": 1.3494606018066406,
"reward_std": 0.6003190875053406,
"rewards/": 6.556826591491699,
"rewards/math_compute_score": 0.0476190485060215,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 1593.666748046875,
"epoch": 0.5107398568019093,
"grad_norm": 0.24942776683135082,
"kl": 0.00311279296875,
"learning_rate": 5.077376040703532e-07,
"loss": 0.0176,
"reward": 1.759412169456482,
"reward_std": 0.6285237073898315,
"rewards/": 6.7018232345581055,
"rewards/math_compute_score": 0.523809552192688,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 1642.8809814453125,
"epoch": 0.513126491646778,
"grad_norm": 0.24962515536899138,
"kl": 0.0032196044921875,
"learning_rate": 5.038689178609011e-07,
"loss": 0.0138,
"reward": 1.337165117263794,
"reward_std": 0.5744600892066956,
"rewards/": 6.400111675262451,
"rewards/math_compute_score": 0.0714285746216774,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 1622.40478515625,
"epoch": 0.5155131264916468,
"grad_norm": 0.30908520647310384,
"kl": 0.00433349609375,
"learning_rate": 5e-07,
"loss": 0.0464,
"reward": 1.8487166166305542,
"reward_std": 0.5867566466331482,
"rewards/": 6.576916217803955,
"rewards/math_compute_score": 0.6666666865348816,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 1577.09521484375,
"epoch": 0.5178997613365155,
"grad_norm": 0.32056320412809136,
"kl": 0.005523681640625,
"learning_rate": 4.961310821390989e-07,
"loss": 0.0627,
"reward": 1.755134105682373,
"reward_std": 0.5534998774528503,
"rewards/": 6.966145992279053,
"rewards/math_compute_score": 0.4523809552192688,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 1711.7381591796875,
"epoch": 0.5202863961813843,
"grad_norm": 0.2765241956924031,
"kl": 0.003692626953125,
"learning_rate": 4.922623959296468e-07,
"loss": 0.0325,
"reward": 1.5642856359481812,
"reward_std": 0.6228559017181396,
"rewards/": 7.059524059295654,
"rewards/math_compute_score": 0.190476194024086,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 1475.5714111328125,
"epoch": 0.522673031026253,
"grad_norm": 0.3255635099084251,
"kl": 0.004364013671875,
"learning_rate": 4.883941730092221e-07,
"loss": 0.0906,
"reward": 2.1429688930511475,
"reward_std": 0.4786287546157837,
"rewards/": 7.476748466491699,
"rewards/math_compute_score": 0.8095238208770752,
"step": 219
},
{
"epoch": 0.5250596658711217,
"grad_norm": 0.2590935031754961,
"learning_rate": 4.845266449876645e-07,
"loss": -0.0051,
"step": 220
},
{
"epoch": 0.5250596658711217,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1727.5059814453125,
"eval_kl": 0.004108428955078125,
"eval_loss": 0.06444509327411652,
"eval_reward": 1.802697241306305,
"eval_reward_std": 0.5698123574256897,
"eval_rewards/": 7.108723998069763,
"eval_rewards/math_compute_score": 0.47619048319756985,
"eval_runtime": 90.4516,
"eval_samples_per_second": 0.232,
"eval_steps_per_second": 0.011,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 1693.4761962890625,
"epoch": 0.5274463007159904,
"grad_norm": 0.24423068905597062,
"kl": 0.00394439697265625,
"learning_rate": 4.806600434332056e-07,
"loss": 0.014,
"reward": 1.3321336507797241,
"reward_std": 0.5291797071695328,
"rewards/": 6.660667896270752,
"rewards/math_compute_score": 0.0,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 1652.5238037109375,
"epoch": 0.5298329355608592,
"grad_norm": 0.27797504178630167,
"kl": 0.004608154296875,
"learning_rate": 4.76794599858606e-07,
"loss": 0.0612,
"reward": 1.6118676662445068,
"reward_std": 0.7166314721107483,
"rewards/": 6.726004600524902,
"rewards/math_compute_score": 0.3333333432674408,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 1610.0238037109375,
"epoch": 0.5322195704057279,
"grad_norm": 0.24481283197859993,
"kl": 0.0047607421875,
"learning_rate": 4.7293054570729126e-07,
"loss": 0.0891,
"reward": 1.7377232313156128,
"reward_std": 0.3753490149974823,
"rewards/": 6.402902126312256,
"rewards/math_compute_score": 0.5714285969734192,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 1788.6905517578125,
"epoch": 0.5346062052505967,
"grad_norm": 0.25068052940129437,
"kl": 0.00421142578125,
"learning_rate": 4.690681123394958e-07,
"loss": -0.0178,
"reward": 1.6801340579986572,
"reward_std": 0.6131526231765747,
"rewards/": 7.448288917541504,
"rewards/math_compute_score": 0.2380952388048172,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 1668.2857666015625,
"epoch": 0.5369928400954654,
"grad_norm": 0.24959786295483227,
"kl": 0.004180908203125,
"learning_rate": 4.6520753101840937e-07,
"loss": 0.0296,
"reward": 1.7870535850524902,
"reward_std": 0.4355296194553375,
"rewards/": 6.935267925262451,
"rewards/math_compute_score": 0.5,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 1850.119140625,
"epoch": 0.5393794749403341,
"grad_norm": 0.2773015045687595,
"kl": 0.00469970703125,
"learning_rate": 4.6134903289633066e-07,
"loss": 0.0505,
"reward": 1.5636905431747437,
"reward_std": 0.6416950821876526,
"rewards/": 7.342262268066406,
"rewards/math_compute_score": 0.1190476194024086,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 1882.1905517578125,
"epoch": 0.5417661097852029,
"grad_norm": 0.2532317353993275,
"kl": 0.0042724609375,
"learning_rate": 4.574928490008264e-07,
"loss": 0.0448,
"reward": 1.1148065328598022,
"reward_std": 0.6571987867355347,
"rewards/": 6.526413917541504,
"rewards/math_compute_score": -0.2380952388048172,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 1814.761962890625,
"epoch": 0.5441527446300716,
"grad_norm": 0.22033332515712287,
"kl": 0.0037384033203125,
"learning_rate": 4.536392102208997e-07,
"loss": 0.0088,
"reward": 1.4956845045089722,
"reward_std": 0.4954971373081207,
"rewards/": 7.097470283508301,
"rewards/math_compute_score": 0.095238097012043,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 1382.0,
"epoch": 0.5465393794749404,
"grad_norm": 0.26858072308905145,
"kl": 0.005218505859375,
"learning_rate": 4.4978834729316376e-07,
"loss": 0.0353,
"reward": 1.7789063453674316,
"reward_std": 0.5954192876815796,
"rewards/": 6.227864742279053,
"rewards/math_compute_score": 0.6666666865348816,
"step": 229
},
{
"epoch": 0.548926014319809,
"grad_norm": 0.3402534060613979,
"learning_rate": 4.459404907880292e-07,
"loss": 0.0897,
"step": 230
},
{
"epoch": 0.548926014319809,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1686.4048156738281,
"eval_kl": 0.00449371337890625,
"eval_loss": 0.023935753852128983,
"eval_reward": 1.7440082132816315,
"eval_reward_std": 0.5679794251918793,
"eval_rewards/": 6.767659664154053,
"eval_rewards/math_compute_score": 0.48809525929391384,
"eval_runtime": 89.8107,
"eval_samples_per_second": 0.234,
"eval_steps_per_second": 0.011,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 1321.3928833007812,
"epoch": 0.5513126491646778,
"grad_norm": 0.242406498971476,
"kl": 0.0046234130859375,
"learning_rate": 4.420958710958956e-07,
"loss": -0.0013,
"reward": 1.9190662503242493,
"reward_std": 0.3563975691795349,
"rewards/": 6.73818826675415,
"rewards/math_compute_score": 0.7142857313156128,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 1509.857177734375,
"epoch": 0.5536992840095465,
"grad_norm": 0.27883626832213565,
"kl": 0.0036468505859375,
"learning_rate": 4.3825471841335924e-07,
"loss": 0.0371,
"reward": 1.6071429252624512,
"reward_std": 0.4766731262207031,
"rewards/": 6.226190567016602,
"rewards/math_compute_score": 0.4523809552192688,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 1675.666748046875,
"epoch": 0.5560859188544153,
"grad_norm": 0.31965378501684055,
"kl": 0.00439453125,
"learning_rate": 4.3441726272942884e-07,
"loss": 0.0565,
"reward": 1.8153274059295654,
"reward_std": 0.6682077050209045,
"rewards/": 6.981399059295654,
"rewards/math_compute_score": 0.523809552192688,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 1410.6905517578125,
"epoch": 0.5584725536992841,
"grad_norm": 0.28553266771721203,
"kl": 0.005126953125,
"learning_rate": 4.305837338117557e-07,
"loss": 0.0325,
"reward": 1.7528274059295654,
"reward_std": 0.553020715713501,
"rewards/": 6.192708492279053,
"rewards/math_compute_score": 0.6428571343421936,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 1491.6905517578125,
"epoch": 0.5608591885441527,
"grad_norm": 0.28723777171974385,
"kl": 0.004852294921875,
"learning_rate": 4.267543611928754e-07,
"loss": 0.0305,
"reward": 2.1510417461395264,
"reward_std": 0.48044469952583313,
"rewards/": 7.326637268066406,
"rewards/math_compute_score": 0.8571428656578064,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 1592.452392578125,
"epoch": 0.5632458233890215,
"grad_norm": 0.25981926019228574,
"kl": 0.00439453125,
"learning_rate": 4.229293741564657e-07,
"loss": 0.0434,
"reward": 1.5803943872451782,
"reward_std": 0.7352063655853271,
"rewards/": 6.378162384033203,
"rewards/math_compute_score": 0.380952388048172,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 1855.166748046875,
"epoch": 0.5656324582338902,
"grad_norm": 0.24285139595600752,
"kl": 0.005462646484375,
"learning_rate": 4.1910900172361763e-07,
"loss": 0.0093,
"reward": 1.3181548118591309,
"reward_std": 0.49610888957977295,
"rewards/": 6.686011791229248,
"rewards/math_compute_score": -0.02380952425301075,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 1805.5714111328125,
"epoch": 0.568019093078759,
"grad_norm": 0.2538583793375119,
"kl": 0.00537109375,
"learning_rate": 4.1529347263912226e-07,
"loss": 0.0095,
"reward": 1.5097098350524902,
"reward_std": 0.4408361315727234,
"rewards/": 7.167596817016602,
"rewards/math_compute_score": 0.095238097012043,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 1417.1429443359375,
"epoch": 0.5704057279236276,
"grad_norm": 0.25434318216206253,
"kl": 0.004608154296875,
"learning_rate": 4.1148301535777587e-07,
"loss": 0.0364,
"reward": 2.183779716491699,
"reward_std": 0.37863248586654663,
"rewards/": 7.6808037757873535,
"rewards/math_compute_score": 0.8095238208770752,
"step": 239
},
{
"epoch": 0.5727923627684964,
"grad_norm": 0.34238218294249706,
"learning_rate": 4.076778580306999e-07,
"loss": -0.0382,
"step": 240
},
{
"epoch": 0.5727923627684964,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1660.4345703125,
"eval_kl": 0.00495147705078125,
"eval_loss": 0.05178676173090935,
"eval_reward": 1.8340391218662262,
"eval_reward_std": 0.5649153962731361,
"eval_rewards/": 6.813052415847778,
"eval_rewards/math_compute_score": 0.5892857238650322,
"eval_runtime": 89.4974,
"eval_samples_per_second": 0.235,
"eval_steps_per_second": 0.011,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 1437.3095397949219,
"epoch": 0.5751789976133651,
"grad_norm": 0.23354633615170214,
"kl": 0.0046844482421875,
"learning_rate": 4.038782284916816e-07,
"loss": 0.0166,
"reward": 1.68543541431427,
"reward_std": 0.4059063643217087,
"rewards/": 6.998605012893677,
"rewards/math_compute_score": 0.3571428656578064,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 1925.2857666015625,
"epoch": 0.5775656324582339,
"grad_norm": 0.2565081546038113,
"kl": 0.004547119140625,
"learning_rate": 4.000843542435315e-07,
"loss": 0.0438,
"reward": 1.1633185148239136,
"reward_std": 0.8174499273300171,
"rewards/": 6.578496932983398,
"rewards/math_compute_score": -0.190476194024086,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 1422.4285888671875,
"epoch": 0.5799522673031027,
"grad_norm": 0.29336911783349,
"kl": 0.005523681640625,
"learning_rate": 3.962964624444625e-07,
"loss": -0.0476,
"reward": 1.511476993560791,
"reward_std": 0.5816119909286499,
"rewards/": 6.319289684295654,
"rewards/math_compute_score": 0.3095238208770752,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 1830.71435546875,
"epoch": 0.5823389021479713,
"grad_norm": 0.2567772027773301,
"kl": 0.004638671875,
"learning_rate": 3.9251477989448795e-07,
"loss": 0.0276,
"reward": 1.3412946462631226,
"reward_std": 0.5368376970291138,
"rewards/": 6.801711559295654,
"rewards/math_compute_score": -0.02380952425301075,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 1410.90478515625,
"epoch": 0.5847255369928401,
"grad_norm": 0.28779373915724527,
"kl": 0.005950927734375,
"learning_rate": 3.8873953302184283e-07,
"loss": 0.0955,
"reward": 2.0691964626312256,
"reward_std": 0.4866236746311188,
"rewards/": 7.107887268066406,
"rewards/math_compute_score": 0.8095238208770752,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 1704.1905517578125,
"epoch": 0.5871121718377088,
"grad_norm": 0.22883743400607778,
"kl": 0.004180908203125,
"learning_rate": 3.849709478694255e-07,
"loss": -0.0272,
"reward": 1.5712053775787354,
"reward_std": 0.5555210709571838,
"rewards/": 6.522693634033203,
"rewards/math_compute_score": 0.3333333432674408,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 1329.452392578125,
"epoch": 0.5894988066825776,
"grad_norm": 0.287861136892914,
"kl": 0.00537109375,
"learning_rate": 3.8120925008126454e-07,
"loss": -0.0196,
"reward": 1.8973217010498047,
"reward_std": 0.2906087338924408,
"rewards/": 7.296131134033203,
"rewards/math_compute_score": 0.5476190447807312,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 1574.7381591796875,
"epoch": 0.5918854415274463,
"grad_norm": 0.2670606131074928,
"kl": 0.005279541015625,
"learning_rate": 3.7745466488900657e-07,
"loss": 0.0051,
"reward": 1.6572545766830444,
"reward_std": 0.5282899141311646,
"rewards/": 6.762463092803955,
"rewards/math_compute_score": 0.380952388048172,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 1842.8333740234375,
"epoch": 0.594272076372315,
"grad_norm": 0.2512682576888719,
"kl": 0.004180908203125,
"learning_rate": 3.7370741709843256e-07,
"loss": 0.0202,
"reward": 1.1577380895614624,
"reward_std": 0.4659985601902008,
"rewards/": 6.741071701049805,
"rewards/math_compute_score": -0.2380952388048172,
"step": 249
},
{
"epoch": 0.5966587112171837,
"grad_norm": 0.2857471624345884,
"learning_rate": 3.69967731075996e-07,
"loss": 0.0114,
"step": 250
},
{
"epoch": 0.5966587112171837,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1692.4345397949219,
"eval_kl": 0.00485992431640625,
"eval_loss": 0.017853517085313797,
"eval_reward": 1.7054944038391113,
"eval_reward_std": 0.5782012939453125,
"eval_rewards/": 6.575090765953064,
"eval_rewards/math_compute_score": 0.48809524066746235,
"eval_runtime": 89.5769,
"eval_samples_per_second": 0.234,
"eval_steps_per_second": 0.011,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 1610.3928833007812,
"epoch": 0.5990453460620525,
"grad_norm": 0.270009406471592,
"kl": 0.004669189453125,
"learning_rate": 3.6623583073538965e-07,
"loss": 0.034,
"reward": 1.5946521162986755,
"reward_std": 0.5078227818012238,
"rewards/": 6.449451208114624,
"rewards/math_compute_score": 0.3809523954987526,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 1729.0,
"epoch": 0.6014319809069213,
"grad_norm": 0.22329451936667,
"kl": 0.0040283203125,
"learning_rate": 3.625119395241386e-07,
"loss": -0.0209,
"reward": 1.681398868560791,
"reward_std": 0.542913019657135,
"rewards/": 7.168899059295654,
"rewards/math_compute_score": 0.3095238208770752,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 1432.4285888671875,
"epoch": 0.60381861575179,
"grad_norm": 0.2991176866926496,
"kl": 0.00604248046875,
"learning_rate": 3.5879628041022135e-07,
"loss": 0.09,
"reward": 2.0321431159973145,
"reward_std": 0.5620549917221069,
"rewards/": 7.684524059295654,
"rewards/math_compute_score": 0.6190476417541504,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 1317.0714111328125,
"epoch": 0.6062052505966588,
"grad_norm": 0.22409602196187103,
"kl": 0.00433349609375,
"learning_rate": 3.550890758687198e-07,
"loss": -0.0005,
"reward": 1.9829614162445068,
"reward_std": 0.2656542956829071,
"rewards/": 6.771949768066406,
"rewards/math_compute_score": 0.785714328289032,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 1662.4761962890625,
"epoch": 0.6085918854415274,
"grad_norm": 0.2603499504911004,
"kl": 0.005218505859375,
"learning_rate": 3.513905478684978e-07,
"loss": 0.0429,
"reward": 1.5799851417541504,
"reward_std": 0.5886630415916443,
"rewards/": 6.566592216491699,
"rewards/math_compute_score": 0.3333333432674408,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 1513.1429443359375,
"epoch": 0.6109785202863962,
"grad_norm": 0.2523855873281953,
"kl": 0.005035400390625,
"learning_rate": 3.47700917858912e-07,
"loss": 0.0282,
"reward": 1.9692708253860474,
"reward_std": 0.30754101276397705,
"rewards/": 7.370163917541504,
"rewards/math_compute_score": 0.6190476417541504,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 1599.357177734375,
"epoch": 0.6133651551312649,
"grad_norm": 0.2641504670239811,
"kl": 0.005340576171875,
"learning_rate": 3.440204067565511e-07,
"loss": 0.0331,
"reward": 1.9764137268066406,
"reward_std": 0.4379138946533203,
"rewards/": 7.024925708770752,
"rewards/math_compute_score": 0.7142857313156128,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 1846.5714111328125,
"epoch": 0.6157517899761337,
"grad_norm": 0.23594897144723076,
"kl": 0.00494384765625,
"learning_rate": 3.4034923493201007e-07,
"loss": 0.0141,
"reward": 1.4104912281036377,
"reward_std": 0.7667174935340881,
"rewards/": 6.385788917541504,
"rewards/math_compute_score": 0.1666666716337204,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 1564.8809814453125,
"epoch": 0.6181384248210023,
"grad_norm": 0.27947145598838724,
"kl": 0.0057373046875,
"learning_rate": 3.366876221966939e-07,
"loss": 0.0397,
"reward": 1.8209078311920166,
"reward_std": 0.4953598976135254,
"rewards/": 7.009300708770752,
"rewards/math_compute_score": 0.523809552192688,
"step": 259
},
{
"epoch": 0.6205250596658711,
"grad_norm": 0.24582056613485512,
"learning_rate": 3.330357877896577e-07,
"loss": -0.0136,
"step": 260
},
{
"epoch": 0.6205250596658711,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1679.5833435058594,
"eval_kl": 0.005035400390625,
"eval_loss": 0.019856100901961327,
"eval_reward": 1.8213519155979156,
"eval_reward_std": 0.5068388804793358,
"eval_rewards/": 6.773425936698914,
"eval_rewards/math_compute_score": 0.5833333469927311,
"eval_runtime": 88.9934,
"eval_samples_per_second": 0.236,
"eval_steps_per_second": 0.011,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 1454.7738647460938,
"epoch": 0.6229116945107399,
"grad_norm": 0.29179311342786673,
"kl": 0.0057525634765625,
"learning_rate": 3.2939395036447875e-07,
"loss": 0.0543,
"reward": 1.9630953073501587,
"reward_std": 0.4546659290790558,
"rewards/": 7.339285850524902,
"rewards/math_compute_score": 0.6190476268529892,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 1676.71435546875,
"epoch": 0.6252983293556086,
"grad_norm": 0.26950110968223,
"kl": 0.00616455078125,
"learning_rate": 3.2576232797616555e-07,
"loss": 0.0363,
"reward": 1.8032739162445068,
"reward_std": 0.540711522102356,
"rewards/": 6.6354169845581055,
"rewards/math_compute_score": 0.5952380895614624,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 1341.952392578125,
"epoch": 0.6276849642004774,
"grad_norm": 0.29281930142683044,
"kl": 0.005645751953125,
"learning_rate": 3.221411380681007e-07,
"loss": 0.0499,
"reward": 1.8809523582458496,
"reward_std": 0.4906759262084961,
"rewards/": 6.738095283508301,
"rewards/math_compute_score": 0.6666666865348816,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 1802.3095703125,
"epoch": 0.630071599045346,
"grad_norm": 0.23358354567385772,
"kl": 0.004974365234375,
"learning_rate": 3.1853059745902285e-07,
"loss": 0.0414,
"reward": 1.7011163234710693,
"reward_std": 0.6680436134338379,
"rewards/": 7.172246932983398,
"rewards/math_compute_score": 0.3333333432674408,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 1440.1905517578125,
"epoch": 0.6324582338902148,
"grad_norm": 0.22255436445316382,
"kl": 0.004638671875,
"learning_rate": 3.1493092233004277e-07,
"loss": -0.0207,
"reward": 2.206026792526245,
"reward_std": 0.30084043741226196,
"rewards/": 7.22061014175415,
"rewards/math_compute_score": 0.9523809552192688,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 1661.6429443359375,
"epoch": 0.6348448687350835,
"grad_norm": 0.24901623510825996,
"kl": 0.005950927734375,
"learning_rate": 3.1134232821170197e-07,
"loss": 0.0692,
"reward": 1.8057291507720947,
"reward_std": 0.6432158946990967,
"rewards/": 6.933407783508301,
"rewards/math_compute_score": 0.523809552192688,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 1834.8333740234375,
"epoch": 0.6372315035799523,
"grad_norm": 0.2693858842162648,
"kl": 0.00518798828125,
"learning_rate": 3.0776502997106523e-07,
"loss": 0.0186,
"reward": 1.266369104385376,
"reward_std": 0.5596219897270203,
"rewards/": 7.09375,
"rewards/math_compute_score": -0.190476194024086,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 1539.6190185546875,
"epoch": 0.639618138424821,
"grad_norm": 0.33229466794859597,
"kl": 0.0054931640625,
"learning_rate": 3.0419924179885767e-07,
"loss": 0.0968,
"reward": 1.6295758485794067,
"reward_std": 0.6508839130401611,
"rewards/": 7.0050225257873535,
"rewards/math_compute_score": 0.2857142984867096,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 1653.7857666015625,
"epoch": 0.6420047732696897,
"grad_norm": 0.2518475987768706,
"kl": 0.0059814453125,
"learning_rate": 3.006451771966383e-07,
"loss": 0.0212,
"reward": 1.811290979385376,
"reward_std": 0.7287031412124634,
"rewards/": 7.627883434295654,
"rewards/math_compute_score": 0.3571428656578064,
"step": 269
},
{
"epoch": 0.6443914081145584,
"grad_norm": 0.26996644513187057,
"learning_rate": 2.97103048964018e-07,
"loss": 0.0185,
"step": 270
},
{
"epoch": 0.6443914081145584,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1707.8691101074219,
"eval_kl": 0.00543212890625,
"eval_loss": 0.0052840313874185085,
"eval_reward": 1.76309534907341,
"eval_reward_std": 0.5642824694514275,
"eval_rewards/": 6.74404776096344,
"eval_rewards/math_compute_score": 0.5178571529686451,
"eval_runtime": 104.2716,
"eval_samples_per_second": 0.201,
"eval_steps_per_second": 0.01,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 1718.3095703125,
"epoch": 0.6467780429594272,
"grad_norm": 0.2669365684764653,
"kl": 0.00506591796875,
"learning_rate": 2.935730691859172e-07,
"loss": 0.0291,
"reward": 1.2227492928504944,
"reward_std": 0.5440776199102402,
"rewards/": 6.256603479385376,
"rewards/math_compute_score": -0.0357142873108387,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 1737.40478515625,
"epoch": 0.649164677804296,
"grad_norm": 0.27901330162610016,
"kl": 0.00506591796875,
"learning_rate": 2.900554492198677e-07,
"loss": 0.064,
"reward": 1.4799107313156128,
"reward_std": 0.568335235118866,
"rewards/": 7.018601417541504,
"rewards/math_compute_score": 0.095238097012043,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 1645.59521484375,
"epoch": 0.6515513126491647,
"grad_norm": 0.3047482282945327,
"kl": 0.00592041015625,
"learning_rate": 2.865503996833577e-07,
"loss": 0.0249,
"reward": 1.6728236675262451,
"reward_std": 0.7209111452102661,
"rewards/": 6.840309143066406,
"rewards/math_compute_score": 0.380952388048172,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 1540.6190185546875,
"epoch": 0.6539379474940334,
"grad_norm": 0.28896859955485643,
"kl": 0.005584716796875,
"learning_rate": 2.8305813044122093e-07,
"loss": -0.0273,
"reward": 1.7266185283660889,
"reward_std": 0.47260189056396484,
"rewards/": 6.633091449737549,
"rewards/math_compute_score": 0.5,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 1646.6190185546875,
"epoch": 0.6563245823389021,
"grad_norm": 0.29507920195415793,
"kl": 0.006500244140625,
"learning_rate": 2.7957885059307095e-07,
"loss": 0.0727,
"reward": 1.7315198183059692,
"reward_std": 0.729472815990448,
"rewards/": 6.562360763549805,
"rewards/math_compute_score": 0.523809552192688,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 1560.9761962890625,
"epoch": 0.6587112171837709,
"grad_norm": 0.259247847289947,
"kl": 0.0059814453125,
"learning_rate": 2.761127684607811e-07,
"loss": 0.0022,
"reward": 1.774553656578064,
"reward_std": 0.343158096075058,
"rewards/": 7.348958492279053,
"rewards/math_compute_score": 0.380952388048172,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 1427.8333740234375,
"epoch": 0.6610978520286396,
"grad_norm": 0.33776370170761744,
"kl": 0.00616455078125,
"learning_rate": 2.7266009157601223e-07,
"loss": 0.0646,
"reward": 1.768303632736206,
"reward_std": 0.5567624568939209,
"rewards/": 6.936756134033203,
"rewards/math_compute_score": 0.4761904776096344,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 1616.3333740234375,
"epoch": 0.6634844868735084,
"grad_norm": 0.24395736200501855,
"kl": 0.005279541015625,
"learning_rate": 2.6922102666778546e-07,
"loss": 0.0763,
"reward": 1.7422620058059692,
"reward_std": 0.58295738697052,
"rewards/": 7.282738208770752,
"rewards/math_compute_score": 0.3571428656578064,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 1870.666748046875,
"epoch": 0.665871121718377,
"grad_norm": 0.2711602572939092,
"kl": 0.005645751953125,
"learning_rate": 2.65795779650105e-07,
"loss": 0.0126,
"reward": 1.3323661088943481,
"reward_std": 0.6261304616928101,
"rewards/": 6.852306842803955,
"rewards/math_compute_score": -0.0476190485060215,
"step": 279
},
{
"epoch": 0.6682577565632458,
"grad_norm": 0.2538680866763002,
"learning_rate": 2.623845556096288e-07,
"loss": 0.0018,
"step": 280
},
{
"epoch": 0.6682577565632458,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1675.9226684570312,
"eval_kl": 0.0055694580078125,
"eval_loss": 0.021293407306075096,
"eval_reward": 1.7486564218997955,
"eval_reward_std": 0.4653979241847992,
"eval_rewards/": 6.719472408294678,
"eval_rewards/math_compute_score": 0.5059523964300752,
"eval_runtime": 89.391,
"eval_samples_per_second": 0.235,
"eval_steps_per_second": 0.011,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 1709.9761962890625,
"epoch": 0.6706443914081146,
"grad_norm": 0.3114946337877643,
"kl": 0.0066375732421875,
"learning_rate": 2.589875587933892e-07,
"loss": 0.0322,
"reward": 1.6709263324737549,
"reward_std": 0.6808830201625824,
"rewards/": 7.164155721664429,
"rewards/math_compute_score": 0.2976190522313118,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 1118.0,
"epoch": 0.6730310262529833,
"grad_norm": 0.28538793372981025,
"kl": 0.0059814453125,
"learning_rate": 2.5560499259656323e-07,
"loss": 0.0625,
"reward": 2.193861961364746,
"reward_std": 0.3232349455356598,
"rewards/": 6.969308376312256,
"rewards/math_compute_score": 1.0,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 1450.7381591796875,
"epoch": 0.6754176610978521,
"grad_norm": 0.2721947819426561,
"kl": 0.006103515625,
"learning_rate": 2.522370595502954e-07,
"loss": 0.0198,
"reward": 1.9322172403335571,
"reward_std": 0.42703866958618164,
"rewards/": 6.803943634033203,
"rewards/math_compute_score": 0.7142857313156128,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 1785.21435546875,
"epoch": 0.6778042959427207,
"grad_norm": 0.2394368126047437,
"kl": 0.006134033203125,
"learning_rate": 2.4888396130956943e-07,
"loss": 0.0365,
"reward": 1.4974703788757324,
"reward_std": 0.7022044658660889,
"rewards/": 6.91592264175415,
"rewards/math_compute_score": 0.1428571492433548,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 1556.40478515625,
"epoch": 0.6801909307875895,
"grad_norm": 0.28935547227648567,
"kl": 0.005462646484375,
"learning_rate": 2.455458986411356e-07,
"loss": 0.0616,
"reward": 1.8514881134033203,
"reward_std": 0.4598250687122345,
"rewards/": 6.971726417541504,
"rewards/math_compute_score": 0.5714285969734192,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 1276.2857666015625,
"epoch": 0.6825775656324582,
"grad_norm": 0.30697866631475274,
"kl": 0.00701904296875,
"learning_rate": 2.4222307141148906e-07,
"loss": -0.0098,
"reward": 1.4965217113494873,
"reward_std": 0.5237927436828613,
"rewards/": 6.054036617279053,
"rewards/math_compute_score": 0.3571428656578064,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 1645.90478515625,
"epoch": 0.684964200477327,
"grad_norm": 0.2560352818812391,
"kl": 0.004364013671875,
"learning_rate": 2.3891567857490367e-07,
"loss": 0.0384,
"reward": 1.592038631439209,
"reward_std": 0.7362250685691833,
"rewards/": 5.864955425262451,
"rewards/math_compute_score": 0.523809552192688,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 1456.261962890625,
"epoch": 0.6873508353221957,
"grad_norm": 0.2542182384142676,
"kl": 0.005615234375,
"learning_rate": 2.3562391816151805e-07,
"loss": 0.0389,
"reward": 1.9461311101913452,
"reward_std": 0.38350486755371094,
"rewards/": 7.444940567016602,
"rewards/math_compute_score": 0.5714285969734192,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 1732.4761962890625,
"epoch": 0.6897374701670644,
"grad_norm": 0.2957765896159597,
"kl": 0.0059814453125,
"learning_rate": 2.3234798726548044e-07,
"loss": 0.0439,
"reward": 1.5767114162445068,
"reward_std": 0.6308966279029846,
"rewards/": 6.8359375,
"rewards/math_compute_score": 0.261904776096344,
"step": 289
},
{
"epoch": 0.6921241050119332,
"grad_norm": 0.24595123987891096,
"learning_rate": 2.2908808203314633e-07,
"loss": 0.0576,
"step": 290
},
{
"epoch": 0.6921241050119332,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1689.4702758789062,
"eval_kl": 0.00591278076171875,
"eval_loss": 0.024828355759382248,
"eval_reward": 1.7160040140151978,
"eval_reward_std": 0.5452041104435921,
"eval_rewards/": 6.651448607444763,
"eval_rewards/math_compute_score": 0.48214287124574184,
"eval_runtime": 89.5372,
"eval_samples_per_second": 0.235,
"eval_steps_per_second": 0.011,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 1495.4642944335938,
"epoch": 0.6945107398568019,
"grad_norm": 0.30325016746870037,
"kl": 0.005462646484375,
"learning_rate": 2.258443976513345e-07,
"loss": 0.0171,
"reward": 1.9273810386657715,
"reward_std": 0.39634837210178375,
"rewards/": 7.208333492279053,
"rewards/math_compute_score": 0.6071428656578064,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 1870.71435546875,
"epoch": 0.6968973747016707,
"grad_norm": 0.24295700768116915,
"kl": 0.0052490234375,
"learning_rate": 2.2261712833564088e-07,
"loss": 0.045,
"reward": 1.1304640769958496,
"reward_std": 0.6737284064292908,
"rewards/": 6.3189873695373535,
"rewards/math_compute_score": -0.1666666716337204,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 1458.2381591796875,
"epoch": 0.6992840095465394,
"grad_norm": 0.2770843906605435,
"kl": 0.006378173828125,
"learning_rate": 2.1940646731880885e-07,
"loss": 0.006,
"reward": 1.776116132736206,
"reward_std": 0.45735234022140503,
"rewards/": 6.975818634033203,
"rewards/math_compute_score": 0.4761904776096344,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 1907.047607421875,
"epoch": 0.7016706443914081,
"grad_norm": 0.27407827255457246,
"kl": 0.0057373046875,
"learning_rate": 2.1621260683916005e-07,
"loss": 0.0246,
"reward": 1.1069941520690918,
"reward_std": 0.706874430179596,
"rewards/": 6.487351417541504,
"rewards/math_compute_score": -0.2380952388048172,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 1427.8809814453125,
"epoch": 0.7040572792362768,
"grad_norm": 0.2531898073661023,
"kl": 0.0052490234375,
"learning_rate": 2.1303573812908383e-07,
"loss": 0.0217,
"reward": 1.809449315071106,
"reward_std": 0.3774469792842865,
"rewards/": 7.332961559295654,
"rewards/math_compute_score": 0.4285714328289032,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 1773.59521484375,
"epoch": 0.7064439140811456,
"grad_norm": 0.2651426374809382,
"kl": 0.005706787109375,
"learning_rate": 2.0987605140358822e-07,
"loss": -0.0014,
"reward": 1.1260230541229248,
"reward_std": 0.657254159450531,
"rewards/": 5.915829658508301,
"rewards/math_compute_score": -0.0714285746216774,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 1461.40478515625,
"epoch": 0.7088305489260143,
"grad_norm": 0.25548482546819784,
"kl": 0.00592041015625,
"learning_rate": 2.0673373584890846e-07,
"loss": 0.0298,
"reward": 2.0536458492279053,
"reward_std": 0.39229947328567505,
"rewards/": 7.030134201049805,
"rewards/math_compute_score": 0.8095238208770752,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 1537.547607421875,
"epoch": 0.711217183770883,
"grad_norm": 0.28766204010964963,
"kl": 0.00634765625,
"learning_rate": 2.0360897961118246e-07,
"loss": 0.0335,
"reward": 1.798586368560791,
"reward_std": 0.5106884241104126,
"rewards/": 7.183407783508301,
"rewards/math_compute_score": 0.4523809552192688,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 1209.4285888671875,
"epoch": 0.7136038186157518,
"grad_norm": 0.2739533568578717,
"kl": 0.006439208984375,
"learning_rate": 2.0050196978518318e-07,
"loss": 0.0196,
"reward": 2.1623237133026123,
"reward_std": 0.330209881067276,
"rewards/": 7.002094745635986,
"rewards/math_compute_score": 0.9523809552192688,
"step": 299
},
{
"epoch": 0.7159904534606205,
"grad_norm": 0.28060465982015764,
"learning_rate": 1.9741289240311754e-07,
"loss": 0.0362,
"step": 300
},
{
"epoch": 0.7159904534606205,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1636.3095397949219,
"eval_kl": 0.0059051513671875,
"eval_loss": 0.04656996577978134,
"eval_reward": 1.842539221048355,
"eval_reward_std": 0.5257963240146637,
"eval_rewards/": 6.712695956230164,
"eval_rewards/math_compute_score": 0.6250000149011612,
"eval_runtime": 88.3841,
"eval_samples_per_second": 0.238,
"eval_steps_per_second": 0.011,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 1566.7381591796875,
"epoch": 0.7183770883054893,
"grad_norm": 0.2791601521311669,
"kl": 0.00567626953125,
"learning_rate": 1.9434193242348706e-07,
"loss": 0.0155,
"reward": 1.7760975360870361,
"reward_std": 0.5548774749040604,
"rewards/": 7.070963621139526,
"rewards/math_compute_score": 0.45238097012043,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 1478.6190185546875,
"epoch": 0.720763723150358,
"grad_norm": 0.2767593330017176,
"kl": 0.005950927734375,
"learning_rate": 1.9128927372001453e-07,
"loss": -0.0091,
"reward": 2.0397322177886963,
"reward_std": 0.43883827328681946,
"rewards/": 7.436756134033203,
"rewards/math_compute_score": 0.6904761791229248,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 1737.261962890625,
"epoch": 0.7231503579952268,
"grad_norm": 0.277582250530735,
"kl": 0.0068359375,
"learning_rate": 1.8825509907063326e-07,
"loss": 0.0513,
"reward": 1.3215030431747437,
"reward_std": 0.6728135347366333,
"rewards/": 6.2265625,
"rewards/math_compute_score": 0.095238097012043,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 1751.5238037109375,
"epoch": 0.7255369928400954,
"grad_norm": 0.2462323539587146,
"kl": 0.005096435546875,
"learning_rate": 1.8523959014654406e-07,
"loss": 0.0345,
"reward": 1.5617932081222534,
"reward_std": 0.7247648239135742,
"rewards/": 6.8565850257873535,
"rewards/math_compute_score": 0.2380952388048172,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 1656.761962890625,
"epoch": 0.7279236276849642,
"grad_norm": 0.2560096521866889,
"kl": 0.006744384765625,
"learning_rate": 1.822429275013374e-07,
"loss": -0.0002,
"reward": 1.9063987731933594,
"reward_std": 0.5774410367012024,
"rewards/": 7.1510419845581055,
"rewards/math_compute_score": 0.5952380895614624,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 1543.761962890625,
"epoch": 0.7303102625298329,
"grad_norm": 0.25382406268426466,
"kl": 0.00506591796875,
"learning_rate": 1.7926529056018297e-07,
"loss": 0.0603,
"reward": 1.8665552139282227,
"reward_std": 0.36209362745285034,
"rewards/": 7.047060966491699,
"rewards/math_compute_score": 0.5714285969734192,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 1739.8333740234375,
"epoch": 0.7326968973747017,
"grad_norm": 0.2836473746570401,
"kl": 0.006866455078125,
"learning_rate": 1.763068576090862e-07,
"loss": 0.0395,
"reward": 1.5139509439468384,
"reward_std": 0.6646405458450317,
"rewards/": 6.712611675262451,
"rewards/math_compute_score": 0.2142857164144516,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 1637.8809814453125,
"epoch": 0.7350835322195705,
"grad_norm": 0.2569474942305258,
"kl": 0.006317138671875,
"learning_rate": 1.7336780578421418e-07,
"loss": 0.0301,
"reward": 1.4991816282272339,
"reward_std": 0.4310193359851837,
"rewards/": 6.9244794845581055,
"rewards/math_compute_score": 0.1428571492433548,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 1488.5,
"epoch": 0.7374701670644391,
"grad_norm": 0.3024850487650047,
"kl": 0.0068359375,
"learning_rate": 1.7044831106128864e-07,
"loss": -0.0418,
"reward": 1.8633928298950195,
"reward_std": 0.3402189016342163,
"rewards/": 7.03125,
"rewards/math_compute_score": 0.5714285969734192,
"step": 309
},
{
"epoch": 0.7398568019093079,
"grad_norm": 0.28358189812459517,
"learning_rate": 1.6754854824504988e-07,
"loss": 0.0584,
"step": 310
},
{
"epoch": 0.7398568019093079,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1617.6310119628906,
"eval_kl": 0.00592803955078125,
"eval_loss": 0.036280252039432526,
"eval_reward": 1.829003930091858,
"eval_reward_std": 0.5586381033062935,
"eval_rewards/": 6.859305262565613,
"eval_rewards/math_compute_score": 0.5714285746216774,
"eval_runtime": 87.8182,
"eval_samples_per_second": 0.239,
"eval_steps_per_second": 0.011,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 1745.5119018554688,
"epoch": 0.7422434367541766,
"grad_norm": 0.23985256079431821,
"kl": 0.006500244140625,
"learning_rate": 1.6466869095879076e-07,
"loss": 0.0432,
"reward": 1.5979260206222534,
"reward_std": 0.6946238726377487,
"rewards/": 6.942010879516602,
"rewards/math_compute_score": 0.261904776096344,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 1562.452392578125,
"epoch": 0.7446300715990454,
"grad_norm": 0.301641383304508,
"kl": 0.00604248046875,
"learning_rate": 1.618089116339601e-07,
"loss": 0.074,
"reward": 1.9055060148239136,
"reward_std": 0.48013433814048767,
"rewards/": 7.527529716491699,
"rewards/math_compute_score": 0.5,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 1563.0238037109375,
"epoch": 0.747016706443914,
"grad_norm": 0.2821395913132352,
"kl": 0.005889892578125,
"learning_rate": 1.5896938149983907e-07,
"loss": 0.0421,
"reward": 1.8350447416305542,
"reward_std": 0.41543343663215637,
"rewards/": 6.889509201049805,
"rewards/math_compute_score": 0.5714285969734192,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 1611.166748046875,
"epoch": 0.7494033412887828,
"grad_norm": 0.28629589377089504,
"kl": 0.006561279296875,
"learning_rate": 1.561502705732883e-07,
"loss": 0.0339,
"reward": 1.900520920753479,
"reward_std": 0.5674861669540405,
"rewards/": 7.788318634033203,
"rewards/math_compute_score": 0.4285714328289032,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 1294.2857666015625,
"epoch": 0.7517899761336515,
"grad_norm": 0.29565718515676304,
"kl": 0.006195068359375,
"learning_rate": 1.5335174764856907e-07,
"loss": 0.0469,
"reward": 2.021242380142212,
"reward_std": 0.4866698682308197,
"rewards/": 7.05859375,
"rewards/math_compute_score": 0.761904776096344,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 1552.3333740234375,
"epoch": 0.7541766109785203,
"grad_norm": 0.2699783744211519,
"kl": 0.00732421875,
"learning_rate": 1.505739802872351e-07,
"loss": -0.0036,
"reward": 1.7238560914993286,
"reward_std": 0.5615432262420654,
"rewards/": 6.809756278991699,
"rewards/math_compute_score": 0.4523809552192688,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 1412.3095703125,
"epoch": 0.7565632458233891,
"grad_norm": 0.25544319193529125,
"kl": 0.005828857421875,
"learning_rate": 1.4781713480810182e-07,
"loss": 0.0326,
"reward": 2.0182292461395264,
"reward_std": 0.21248388290405273,
"rewards/": 7.4244794845581055,
"rewards/math_compute_score": 0.6666666865348816,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 1879.5238037109375,
"epoch": 0.7589498806682577,
"grad_norm": 0.2858571715960386,
"kl": 0.00732421875,
"learning_rate": 1.4508137627728628e-07,
"loss": 0.0278,
"reward": 1.18154776096344,
"reward_std": 0.6407575607299805,
"rewards/": 6.574404716491699,
"rewards/math_compute_score": -0.1666666716337204,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 1535.09521484375,
"epoch": 0.7613365155131265,
"grad_norm": 0.2434413985432686,
"kl": 0.00579833984375,
"learning_rate": 1.4236686849832496e-07,
"loss": 0.0309,
"reward": 1.8485863208770752,
"reward_std": 0.5873944759368896,
"rewards/": 7.147693634033203,
"rewards/math_compute_score": 0.523809552192688,
"step": 319
},
{
"epoch": 0.7637231503579952,
"grad_norm": 0.2696031278018177,
"learning_rate": 1.3967377400236514e-07,
"loss": 0.0202,
"step": 320
},
{
"epoch": 0.7637231503579952,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1639.8214721679688,
"eval_kl": 0.00627899169921875,
"eval_loss": 0.04983534291386604,
"eval_reward": 1.8444220423698425,
"eval_reward_std": 0.505550891160965,
"eval_rewards/": 6.960205316543579,
"eval_rewards/math_compute_score": 0.5654762051999569,
"eval_runtime": 88.8251,
"eval_samples_per_second": 0.236,
"eval_steps_per_second": 0.011,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 1541.7142944335938,
"epoch": 0.766109785202864,
"grad_norm": 0.23918547245411584,
"kl": 0.0065765380859375,
"learning_rate": 1.370022540384347e-07,
"loss": 0.0411,
"reward": 1.8474704027175903,
"reward_std": 0.53986856341362,
"rewards/": 7.046875,
"rewards/math_compute_score": 0.5476190522313118,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 1663.0,
"epoch": 0.7684964200477327,
"grad_norm": 0.26591156051162185,
"kl": 0.00579833984375,
"learning_rate": 1.3435246856378525e-07,
"loss": -0.0037,
"reward": 1.948484182357788,
"reward_std": 0.6001127362251282,
"rewards/": 7.17099142074585,
"rewards/math_compute_score": 0.6428571343421936,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 1639.1905517578125,
"epoch": 0.7708830548926014,
"grad_norm": 0.27667895800229375,
"kl": 0.00653076171875,
"learning_rate": 1.3172457623431705e-07,
"loss": 0.0247,
"reward": 1.505738377571106,
"reward_std": 0.7357364892959595,
"rewards/": 6.766787528991699,
"rewards/math_compute_score": 0.190476194024086,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 1452.0,
"epoch": 0.7732696897374701,
"grad_norm": 0.2607746555497128,
"kl": 0.005889892578125,
"learning_rate": 1.2911873439507765e-07,
"loss": 0.0434,
"reward": 1.8965216875076294,
"reward_std": 0.5311101675033569,
"rewards/": 6.33975076675415,
"rewards/math_compute_score": 0.785714328289032,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 1695.857177734375,
"epoch": 0.7756563245823389,
"grad_norm": 0.2459311422515735,
"kl": 0.005828857421875,
"learning_rate": 1.265350990708417e-07,
"loss": 0.0237,
"reward": 1.4831101894378662,
"reward_std": 0.5591450333595276,
"rewards/": 6.653645992279053,
"rewards/math_compute_score": 0.190476194024086,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 1460.8333740234375,
"epoch": 0.7780429594272077,
"grad_norm": 0.27149132930004455,
"kl": 0.006072998046875,
"learning_rate": 1.2397382495676873e-07,
"loss": 0.0584,
"reward": 1.9678572416305542,
"reward_std": 0.5532270073890686,
"rewards/": 7.077381134033203,
"rewards/math_compute_score": 0.6904761791229248,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 1638.9285888671875,
"epoch": 0.7804295942720764,
"grad_norm": 0.25969805069075325,
"kl": 0.00555419921875,
"learning_rate": 1.214350654091413e-07,
"loss": 0.0538,
"reward": 1.7508186101913452,
"reward_std": 0.6165412068367004,
"rewards/": 6.944568634033203,
"rewards/math_compute_score": 0.4523809552192688,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 1443.452392578125,
"epoch": 0.7828162291169452,
"grad_norm": 0.24646262050218948,
"kl": 0.006866455078125,
"learning_rate": 1.1891897243618183e-07,
"loss": -0.0158,
"reward": 2.0401785373687744,
"reward_std": 0.37470337748527527,
"rewards/": 7.248512268066406,
"rewards/math_compute_score": 0.738095223903656,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 1417.7857666015625,
"epoch": 0.7852028639618138,
"grad_norm": 0.3456800505744401,
"kl": 0.0074462890625,
"learning_rate": 1.1642569668895169e-07,
"loss": 0.1143,
"reward": 1.848995566368103,
"reward_std": 0.655685305595398,
"rewards/": 7.244977951049805,
"rewards/math_compute_score": 0.5,
"step": 329
},
{
"epoch": 0.7875894988066826,
"grad_norm": 0.2836166409784225,
"learning_rate": 1.139553874523313e-07,
"loss": 0.0474,
"step": 330
},
{
"epoch": 0.7875894988066826,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1675.7380981445312,
"eval_kl": 0.0064849853515625,
"eval_loss": 0.037556298077106476,
"eval_reward": 1.8106631338596344,
"eval_reward_std": 0.45607033371925354,
"eval_rewards/": 6.7676016092300415,
"eval_rewards/math_compute_score": 0.5714285839349031,
"eval_runtime": 89.131,
"eval_samples_per_second": 0.236,
"eval_steps_per_second": 0.011,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 1613.4286499023438,
"epoch": 0.7899761336515513,
"grad_norm": 0.27172662798968367,
"kl": 0.006988525390625,
"learning_rate": 1.1150819263608097e-07,
"loss": -0.0121,
"reward": 1.7162761092185974,
"reward_std": 0.4672919511795044,
"rewards/": 7.676618576049805,
"rewards/math_compute_score": 0.226190485060215,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 1452.0238037109375,
"epoch": 0.7923627684964201,
"grad_norm": 0.29870503341388466,
"kl": 0.007415771484375,
"learning_rate": 1.090842587659851e-07,
"loss": 0.034,
"reward": 2.050297737121582,
"reward_std": 0.4812738299369812,
"rewards/": 7.299107074737549,
"rewards/math_compute_score": 0.738095223903656,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 1663.761962890625,
"epoch": 0.7947494033412887,
"grad_norm": 0.23048288658260976,
"kl": 0.005828857421875,
"learning_rate": 1.0668373097507921e-07,
"loss": 0.0221,
"reward": 1.6446057558059692,
"reward_std": 0.6130920052528381,
"rewards/": 6.889695167541504,
"rewards/math_compute_score": 0.3333333432674408,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 1342.952392578125,
"epoch": 0.7971360381861575,
"grad_norm": 0.2658342486141691,
"kl": 0.00714111328125,
"learning_rate": 1.0430675299495973e-07,
"loss": -0.0088,
"reward": 1.744512677192688,
"reward_std": 0.5626569986343384,
"rewards/": 6.246372699737549,
"rewards/math_compute_score": 0.6190476417541504,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 1467.0,
"epoch": 0.7995226730310262,
"grad_norm": 0.23546913033927908,
"kl": 0.00677490234375,
"learning_rate": 1.0195346714717812e-07,
"loss": -0.0662,
"reward": 1.8030506372451782,
"reward_std": 0.6685277223587036,
"rewards/": 6.824777126312256,
"rewards/math_compute_score": 0.5476190447807312,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 1890.9285888671875,
"epoch": 0.801909307875895,
"grad_norm": 0.24627575755994274,
"kl": 0.0064697265625,
"learning_rate": 9.962401433471984e-08,
"loss": 0.016,
"reward": 1.0757441520690918,
"reward_std": 0.5567899346351624,
"rewards/": 6.140625,
"rewards/math_compute_score": -0.190476194024086,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 1682.5,
"epoch": 0.8042959427207638,
"grad_norm": 0.2843068676706687,
"kl": 0.00732421875,
"learning_rate": 9.731853403356705e-08,
"loss": 0.0064,
"reward": 1.4380580186843872,
"reward_std": 0.3497306704521179,
"rewards/": 6.047433376312256,
"rewards/math_compute_score": 0.2857142984867096,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 1401.642822265625,
"epoch": 0.8066825775656324,
"grad_norm": 0.29907908692795154,
"kl": 0.007476806640625,
"learning_rate": 9.503716428434799e-08,
"loss": -0.0348,
"reward": 1.8360120058059692,
"reward_std": 0.4471094012260437,
"rewards/": 6.989583492279053,
"rewards/math_compute_score": 0.5476190447807312,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 1525.6190185546875,
"epoch": 0.8090692124105012,
"grad_norm": 0.28235655011179184,
"kl": 0.00653076171875,
"learning_rate": 9.27800416840715e-08,
"loss": 0.0183,
"reward": 2.1017115116119385,
"reward_std": 0.5442370176315308,
"rewards/": 7.4609375,
"rewards/math_compute_score": 0.761904776096344,
"step": 339
},
{
"epoch": 0.8114558472553699,
"grad_norm": 0.23832517340240178,
"learning_rate": 9.054730137794886e-08,
"loss": 0.014,
"step": 340
},
{
"epoch": 0.8114558472553699,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1661.107177734375,
"eval_kl": 0.00655364990234375,
"eval_loss": 0.023055095225572586,
"eval_reward": 1.863730102777481,
"eval_reward_std": 0.5041995421051979,
"eval_rewards/": 6.818650126457214,
"eval_rewards/math_compute_score": 0.6250000149011612,
"eval_runtime": 89.1847,
"eval_samples_per_second": 0.235,
"eval_steps_per_second": 0.011,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 1674.6666870117188,
"epoch": 0.8138424821002387,
"grad_norm": 0.2632361321457208,
"kl": 0.00665283203125,
"learning_rate": 8.833907705130089e-08,
"loss": 0.0161,
"reward": 1.6610864400863647,
"reward_std": 0.4744073450565338,
"rewards/": 6.8292412757873535,
"rewards/math_compute_score": 0.3690476194024086,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 1667.90478515625,
"epoch": 0.8162291169451074,
"grad_norm": 0.2531099565121514,
"kl": 0.00653076171875,
"learning_rate": 8.615550092155477e-08,
"loss": 0.0291,
"reward": 1.6218007802963257,
"reward_std": 0.5996249318122864,
"rewards/": 6.680431842803955,
"rewards/math_compute_score": 0.3571428656578064,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 1443.9761962890625,
"epoch": 0.8186157517899761,
"grad_norm": 0.3708436064315245,
"kl": 0.007537841796875,
"learning_rate": 8.399670373032663e-08,
"loss": 0.0949,
"reward": 2.0575146675109863,
"reward_std": 0.7847402691841125,
"rewards/": 7.906621932983398,
"rewards/math_compute_score": 0.5952380895614624,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 1706.9285888671875,
"epoch": 0.8210023866348448,
"grad_norm": 0.2377499384119013,
"kl": 0.006683349609375,
"learning_rate": 8.186281473559381e-08,
"loss": 0.0243,
"reward": 2.0348215103149414,
"reward_std": 0.32591211795806885,
"rewards/": 7.316964626312256,
"rewards/math_compute_score": 0.7142857313156128,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 1619.357177734375,
"epoch": 0.8233890214797136,
"grad_norm": 0.26691977510844433,
"kl": 0.00634765625,
"learning_rate": 7.97539617039552e-08,
"loss": 0.0268,
"reward": 1.8893601894378662,
"reward_std": 0.6046797633171082,
"rewards/": 7.065848350524902,
"rewards/math_compute_score": 0.5952380895614624,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 1332.452392578125,
"epoch": 0.8257756563245824,
"grad_norm": 0.26196875577278594,
"kl": 0.006988525390625,
"learning_rate": 7.767027090298206e-08,
"loss": 0.0965,
"reward": 2.3058035373687744,
"reward_std": 0.30486685037612915,
"rewards/": 7.909970283508301,
"rewards/math_compute_score": 0.9047619104385376,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 1640.8809814453125,
"epoch": 0.8281622911694511,
"grad_norm": 0.3180196339273012,
"kl": 0.00750732421875,
"learning_rate": 7.561186709365652e-08,
"loss": 0.0582,
"reward": 1.2651599645614624,
"reward_std": 0.5785154700279236,
"rewards/": 6.4210381507873535,
"rewards/math_compute_score": -0.02380952425301075,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 1549.4285888671875,
"epoch": 0.8305489260143198,
"grad_norm": 0.23148361007108187,
"kl": 0.00537109375,
"learning_rate": 7.357887352290227e-08,
"loss": -0.0128,
"reward": 1.6349704265594482,
"reward_std": 0.49700257182121277,
"rewards/": 6.936756134033203,
"rewards/math_compute_score": 0.3095238208770752,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 1679.8095703125,
"epoch": 0.8329355608591885,
"grad_norm": 0.25788580600104904,
"kl": 0.006622314453125,
"learning_rate": 7.157141191620548e-08,
"loss": 0.0082,
"reward": 1.3982887268066406,
"reward_std": 0.604521632194519,
"rewards/": 6.42001485824585,
"rewards/math_compute_score": 0.1428571492433548,
"step": 349
},
{
"epoch": 0.8353221957040573,
"grad_norm": 0.30282291813716555,
"learning_rate": 6.958960247032513e-08,
"loss": 0.0621,
"step": 350
},
{
"epoch": 0.8353221957040573,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1671.3452758789062,
"eval_kl": 0.006561279296875,
"eval_loss": 0.03596196323633194,
"eval_reward": 1.8592354953289032,
"eval_reward_std": 0.509850487112999,
"eval_rewards/": 6.89141571521759,
"eval_rewards/math_compute_score": 0.6011904925107956,
"eval_runtime": 91.8744,
"eval_samples_per_second": 0.229,
"eval_steps_per_second": 0.011,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 1468.9166870117188,
"epoch": 0.837708830548926,
"grad_norm": 0.253510225674823,
"kl": 0.00677490234375,
"learning_rate": 6.763356384609809e-08,
"loss": 0.0293,
"reward": 1.8625745177268982,
"reward_std": 0.5618998408317566,
"rewards/": 7.3128721714019775,
"rewards/math_compute_score": 0.5000000149011612,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 1764.8095703125,
"epoch": 0.8400954653937948,
"grad_norm": 0.24138171322547658,
"kl": 0.005584716796875,
"learning_rate": 6.570341316133272e-08,
"loss": 0.0248,
"reward": 1.5279762744903564,
"reward_std": 0.5388675928115845,
"rewards/": 6.782738208770752,
"rewards/math_compute_score": 0.2142857164144516,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 1326.547607421875,
"epoch": 0.8424821002386634,
"grad_norm": 0.2826923722060457,
"kl": 0.00726318359375,
"learning_rate": 6.379926598379725e-08,
"loss": 0.0152,
"reward": 2.1721725463867188,
"reward_std": 0.3049697279930115,
"rewards/": 7.718006134033203,
"rewards/math_compute_score": 0.785714328289032,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 1451.4285888671875,
"epoch": 0.8448687350835322,
"grad_norm": 0.3179144298199542,
"kl": 0.006561279296875,
"learning_rate": 6.192123632429985e-08,
"loss": 0.0487,
"reward": 1.677864670753479,
"reward_std": 0.46759727597236633,
"rewards/": 7.151227951049805,
"rewards/math_compute_score": 0.3095238208770752,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 1683.1190185546875,
"epoch": 0.847255369928401,
"grad_norm": 0.29414085786077687,
"kl": 0.006103515625,
"learning_rate": 6.006943662986275e-08,
"loss": 0.0241,
"reward": 1.278906226158142,
"reward_std": 0.5312941670417786,
"rewards/": 6.680245876312256,
"rewards/math_compute_score": -0.0714285746216774,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 1503.5,
"epoch": 0.8496420047732697,
"grad_norm": 0.3026402934237509,
"kl": 0.0067138671875,
"learning_rate": 5.824397777698858e-08,
"loss": 0.1011,
"reward": 1.7272508144378662,
"reward_std": 0.4776250720024109,
"rewards/": 7.017206192016602,
"rewards/math_compute_score": 0.4047619104385376,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 1564.90478515625,
"epoch": 0.8520286396181385,
"grad_norm": 0.2936342545079758,
"kl": 0.00634765625,
"learning_rate": 5.644496906502233e-08,
"loss": 0.0701,
"reward": 1.853273868560791,
"reward_std": 0.5276380181312561,
"rewards/": 7.171131134033203,
"rewards/math_compute_score": 0.523809552192688,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 1197.0,
"epoch": 0.8544152744630071,
"grad_norm": 0.30773278900467704,
"kl": 0.00738525390625,
"learning_rate": 5.4672518209607e-08,
"loss": -0.0002,
"reward": 1.991220235824585,
"reward_std": 0.5516120195388794,
"rewards/": 6.622767925262451,
"rewards/math_compute_score": 0.8333333730697632,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 1567.166748046875,
"epoch": 0.8568019093078759,
"grad_norm": 0.2473090817946399,
"kl": 0.0057373046875,
"learning_rate": 5.292673133623371e-08,
"loss": -0.0307,
"reward": 1.5518603324890137,
"reward_std": 0.48620352149009705,
"rewards/": 6.711681842803955,
"rewards/math_compute_score": 0.261904776096344,
"step": 359
},
{
"epoch": 0.8591885441527446,
"grad_norm": 0.23273491855228218,
"learning_rate": 5.1207712973887876e-08,
"loss": 0.0191,
"step": 360
},
{
"epoch": 0.8591885441527446,
"eval_clip_ratio": 0.0,
"eval_completion_length": 1646.3214721679688,
"eval_kl": 0.00658416748046875,
"eval_loss": 0.017793310806155205,
"eval_reward": 1.8062395751476288,
"eval_reward_std": 0.5819119140505791,
"eval_rewards/": 6.840721607208252,
"eval_rewards/math_compute_score": 0.5476190708577633,
"eval_runtime": 89.4518,
"eval_samples_per_second": 0.235,
"eval_steps_per_second": 0.011,
"step": 360
}
],
"logging_steps": 1.0,
"max_steps": 419,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 40,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 14,
"trial_name": null,
"trial_params": null
}