| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.8591885441527446, | |
| "eval_steps": 10, | |
| "global_step": 360, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1852.119140625, | |
| "epoch": 0.002386634844868735, | |
| "grad_norm": 0.24904002787956508, | |
| "kl": 0.0, | |
| "learning_rate": 7.692307692307692e-08, | |
| "loss": 0.0859, | |
| "reward": 0.9985864162445068, | |
| "reward_std": 0.8023478388786316, | |
| "rewards/": 5.9453125, | |
| "rewards/math_compute_score": -0.2380952388048172, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1857.7381591796875, | |
| "epoch": 0.00477326968973747, | |
| "grad_norm": 0.24734786328196506, | |
| "kl": 0.0, | |
| "learning_rate": 1.5384615384615385e-07, | |
| "loss": 0.0801, | |
| "reward": 1.084356427192688, | |
| "reward_std": 0.8982762098312378, | |
| "rewards/": 5.802734375, | |
| "rewards/math_compute_score": -0.095238097012043, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1849.666748046875, | |
| "epoch": 0.007159904534606206, | |
| "grad_norm": 0.2778667928712439, | |
| "kl": 9.202957153320312e-05, | |
| "learning_rate": 2.3076923076923078e-07, | |
| "loss": 0.0626, | |
| "reward": 1.2749255895614624, | |
| "reward_std": 1.0164073705673218, | |
| "rewards/": 5.803199291229248, | |
| "rewards/math_compute_score": 0.1428571492433548, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1977.166748046875, | |
| "epoch": 0.00954653937947494, | |
| "grad_norm": 0.21684956038503114, | |
| "kl": 6.771087646484375e-05, | |
| "learning_rate": 3.076923076923077e-07, | |
| "loss": 0.021, | |
| "reward": 0.7873699069023132, | |
| "reward_std": 0.7230538725852966, | |
| "rewards/": 6.032087326049805, | |
| "rewards/math_compute_score": -0.523809552192688, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1939.40478515625, | |
| "epoch": 0.011933174224343675, | |
| "grad_norm": 0.24259770313581716, | |
| "kl": 9.012222290039062e-05, | |
| "learning_rate": 3.8461538461538463e-07, | |
| "loss": 0.0349, | |
| "reward": 1.0736374855041504, | |
| "reward_std": 0.8964518904685974, | |
| "rewards/": 6.130092144012451, | |
| "rewards/math_compute_score": -0.190476194024086, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2030.21435546875, | |
| "epoch": 0.014319809069212411, | |
| "grad_norm": 0.23532745486358006, | |
| "kl": 0.000102996826171875, | |
| "learning_rate": 4.6153846153846156e-07, | |
| "loss": 0.0077, | |
| "reward": 0.669866144657135, | |
| "reward_std": 0.6640447378158569, | |
| "rewards/": 6.015996932983398, | |
| "rewards/math_compute_score": -0.6666666865348816, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1837.047607421875, | |
| "epoch": 0.016706443914081145, | |
| "grad_norm": 0.24695210299782985, | |
| "kl": 7.62939453125e-05, | |
| "learning_rate": 5.384615384615384e-07, | |
| "loss": 0.0643, | |
| "reward": 1.1588542461395264, | |
| "reward_std": 0.8526113629341125, | |
| "rewards/": 5.794270992279053, | |
| "rewards/math_compute_score": 0.0, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1710.7857666015625, | |
| "epoch": 0.01909307875894988, | |
| "grad_norm": 0.2635039041467129, | |
| "kl": 0.00010061264038085938, | |
| "learning_rate": 6.153846153846154e-07, | |
| "loss": 0.0352, | |
| "reward": 1.4391371011734009, | |
| "reward_std": 0.6852283477783203, | |
| "rewards/": 6.624256134033203, | |
| "rewards/math_compute_score": 0.1428571492433548, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1951.8095703125, | |
| "epoch": 0.021479713603818614, | |
| "grad_norm": 0.22619335380819766, | |
| "kl": 7.82012939453125e-05, | |
| "learning_rate": 6.923076923076922e-07, | |
| "loss": 0.0341, | |
| "reward": 0.9593006372451782, | |
| "reward_std": 0.7629837989807129, | |
| "rewards/": 6.129836559295654, | |
| "rewards/math_compute_score": -0.3333333432674408, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.02386634844868735, | |
| "grad_norm": 0.2507566093873831, | |
| "learning_rate": 7.692307692307693e-07, | |
| "loss": 0.026, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.02386634844868735, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1909.1607666015625, | |
| "eval_kl": 9.047985076904297e-05, | |
| "eval_loss": 0.034137628972530365, | |
| "eval_reward": 1.143368422985077, | |
| "eval_reward_std": 0.7167749404907227, | |
| "eval_rewards/": 6.193032503128052, | |
| "eval_rewards/math_compute_score": -0.11904762079939246, | |
| "eval_runtime": 94.5877, | |
| "eval_samples_per_second": 0.222, | |
| "eval_steps_per_second": 0.011, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1677.8928833007812, | |
| "epoch": 0.026252983293556086, | |
| "grad_norm": 0.2771089455944722, | |
| "kl": 0.00010251998901367188, | |
| "learning_rate": 8.461538461538461e-07, | |
| "loss": 0.0246, | |
| "reward": 1.308068335056305, | |
| "reward_std": 0.6968488693237305, | |
| "rewards/": 5.778436660766602, | |
| "rewards/math_compute_score": 0.19047619588673115, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1728.4285888671875, | |
| "epoch": 0.028639618138424822, | |
| "grad_norm": 0.27092892547921155, | |
| "kl": 0.0001068115234375, | |
| "learning_rate": 9.230769230769231e-07, | |
| "loss": 0.0862, | |
| "reward": 1.2834821939468384, | |
| "reward_std": 0.9967786073684692, | |
| "rewards/": 5.750744342803955, | |
| "rewards/math_compute_score": 0.1666666716337204, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1984.619140625, | |
| "epoch": 0.031026252983293555, | |
| "grad_norm": 0.22856130082794326, | |
| "kl": 9.012222290039062e-05, | |
| "learning_rate": 1e-06, | |
| "loss": 0.038, | |
| "reward": 0.910714328289032, | |
| "reward_std": 0.7129672169685364, | |
| "rewards/": 6.458333492279053, | |
| "rewards/math_compute_score": -0.4761904776096344, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1827.3095703125, | |
| "epoch": 0.03341288782816229, | |
| "grad_norm": 0.2724826324747265, | |
| "kl": 0.00011968612670898438, | |
| "learning_rate": 9.99985031250522e-07, | |
| "loss": 0.014, | |
| "reward": 1.2249256372451782, | |
| "reward_std": 0.595586895942688, | |
| "rewards/": 6.2198662757873535, | |
| "rewards/math_compute_score": -0.02380952425301075, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1637.6905517578125, | |
| "epoch": 0.03579952267303103, | |
| "grad_norm": 0.27133525373007367, | |
| "kl": 9.775161743164062e-05, | |
| "learning_rate": 9.999401258983425e-07, | |
| "loss": 0.0523, | |
| "reward": 1.1956148147583008, | |
| "reward_std": 0.6316924691200256, | |
| "rewards/": 5.692359447479248, | |
| "rewards/math_compute_score": 0.0714285746216774, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1857.0, | |
| "epoch": 0.03818615751789976, | |
| "grad_norm": 0.25395778947044934, | |
| "kl": 9.822845458984375e-05, | |
| "learning_rate": 9.998652866321687e-07, | |
| "loss": 0.0322, | |
| "reward": 1.1026042699813843, | |
| "reward_std": 0.5841531157493591, | |
| "rewards/": 6.8463544845581055, | |
| "rewards/math_compute_score": -0.3333333432674408, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1820.2857666015625, | |
| "epoch": 0.0405727923627685, | |
| "grad_norm": 0.27104243767092423, | |
| "kl": 0.00010204315185546875, | |
| "learning_rate": 9.997605179330017e-07, | |
| "loss": 0.0587, | |
| "reward": 1.2473958730697632, | |
| "reward_std": 0.9411390423774719, | |
| "rewards/": 6.332217216491699, | |
| "rewards/math_compute_score": -0.02380952425301075, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2032.6905517578125, | |
| "epoch": 0.04295942720763723, | |
| "grad_norm": 0.2425790646876099, | |
| "kl": 9.679794311523438e-05, | |
| "learning_rate": 9.996258260738674e-07, | |
| "loss": 0.0062, | |
| "reward": 1.0361607074737549, | |
| "reward_std": 0.7374575734138489, | |
| "rewards/": 6.704613208770752, | |
| "rewards/math_compute_score": -0.380952388048172, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1860.5238037109375, | |
| "epoch": 0.045346062052505964, | |
| "grad_norm": 0.2526155285730889, | |
| "kl": 9.393692016601562e-05, | |
| "learning_rate": 9.994612191194405e-07, | |
| "loss": 0.0504, | |
| "reward": 1.2767857313156128, | |
| "reward_std": 0.6392523646354675, | |
| "rewards/": 6.19345235824585, | |
| "rewards/math_compute_score": 0.0476190485060215, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.0477326968973747, | |
| "grad_norm": 0.2019998878404815, | |
| "learning_rate": 9.992667069255618e-07, | |
| "loss": 0.0563, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0477326968973747, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1916.7678833007812, | |
| "eval_kl": 8.654594421386719e-05, | |
| "eval_loss": 0.045527394860982895, | |
| "eval_reward": 1.1353678554296494, | |
| "eval_reward_std": 0.7787726670503616, | |
| "eval_rewards/": 6.105410695075989, | |
| "eval_rewards/math_compute_score": -0.10714286100119352, | |
| "eval_runtime": 94.5472, | |
| "eval_samples_per_second": 0.222, | |
| "eval_steps_per_second": 0.011, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1885.857177734375, | |
| "epoch": 0.050119331742243436, | |
| "grad_norm": 0.23793932188276107, | |
| "kl": 7.82012939453125e-05, | |
| "learning_rate": 9.990423011386488e-07, | |
| "loss": 0.0364, | |
| "reward": 1.1678432822227478, | |
| "reward_std": 0.7990120947360992, | |
| "rewards/": 6.220168590545654, | |
| "rewards/math_compute_score": -0.09523809887468815, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1924.619140625, | |
| "epoch": 0.05250596658711217, | |
| "grad_norm": 0.24652779541561778, | |
| "kl": 9.965896606445312e-05, | |
| "learning_rate": 9.987880151949975e-07, | |
| "loss": 0.0332, | |
| "reward": 0.9929687976837158, | |
| "reward_std": 0.6396169066429138, | |
| "rewards/": 5.917224884033203, | |
| "rewards/math_compute_score": -0.2380952388048172, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1572.8095703125, | |
| "epoch": 0.05489260143198091, | |
| "grad_norm": 0.2535111310244703, | |
| "kl": 7.486343383789062e-05, | |
| "learning_rate": 9.985038643199778e-07, | |
| "loss": 0.0609, | |
| "reward": 1.378557562828064, | |
| "reward_std": 0.7171154618263245, | |
| "rewards/": 4.892787456512451, | |
| "rewards/math_compute_score": 0.5, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1634.3095703125, | |
| "epoch": 0.057279236276849645, | |
| "grad_norm": 0.28708136371156134, | |
| "kl": 9.822845458984375e-05, | |
| "learning_rate": 9.981898655271234e-07, | |
| "loss": 0.0954, | |
| "reward": 1.303076148033142, | |
| "reward_std": 0.84552401304245, | |
| "rewards/": 5.658237934112549, | |
| "rewards/math_compute_score": 0.2142857164144516, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1806.9761962890625, | |
| "epoch": 0.059665871121718374, | |
| "grad_norm": 0.23136332868998175, | |
| "kl": 8.58306884765625e-05, | |
| "learning_rate": 9.978460376171112e-07, | |
| "loss": 0.0437, | |
| "reward": 1.099237322807312, | |
| "reward_std": 0.5548410415649414, | |
| "rewards/": 6.258091449737549, | |
| "rewards/math_compute_score": -0.190476194024086, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1635.0, | |
| "epoch": 0.06205250596658711, | |
| "grad_norm": 0.27536984991640817, | |
| "kl": 0.00011491775512695312, | |
| "learning_rate": 9.974724011766361e-07, | |
| "loss": 0.0625, | |
| "reward": 1.6150113344192505, | |
| "reward_std": 0.5635201930999756, | |
| "rewards/": 6.456008434295654, | |
| "rewards/math_compute_score": 0.4047619104385376, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1988.5, | |
| "epoch": 0.06443914081145585, | |
| "grad_norm": 0.22513198771984447, | |
| "kl": 8.0108642578125e-05, | |
| "learning_rate": 9.970689785771798e-07, | |
| "loss": 0.0169, | |
| "reward": 1.2694941759109497, | |
| "reward_std": 0.6696727275848389, | |
| "rewards/": 6.442708492279053, | |
| "rewards/math_compute_score": -0.02380952425301075, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1631.1429443359375, | |
| "epoch": 0.06682577565632458, | |
| "grad_norm": 0.2269622871051312, | |
| "kl": 7.2479248046875e-05, | |
| "learning_rate": 9.96635793973669e-07, | |
| "loss": 0.0095, | |
| "reward": 1.7527531385421753, | |
| "reward_std": 0.3711332380771637, | |
| "rewards/": 6.76376485824585, | |
| "rewards/math_compute_score": 0.5, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1927.3095703125, | |
| "epoch": 0.06921241050119331, | |
| "grad_norm": 0.2768313841282757, | |
| "kl": 0.00011539459228515625, | |
| "learning_rate": 9.961728733030316e-07, | |
| "loss": 0.0523, | |
| "reward": 1.046147346496582, | |
| "reward_std": 0.8631386756896973, | |
| "rewards/": 5.992640972137451, | |
| "rewards/math_compute_score": -0.190476194024086, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.07159904534606205, | |
| "grad_norm": 0.22336747996927972, | |
| "learning_rate": 9.956802442826415e-07, | |
| "loss": -0.0099, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.07159904534606205, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1907.7559814453125, | |
| "eval_kl": 8.702278137207031e-05, | |
| "eval_loss": 0.05547888204455376, | |
| "eval_reward": 1.119970753788948, | |
| "eval_reward_std": 0.7313214838504791, | |
| "eval_rewards/": 6.195091724395752, | |
| "eval_rewards/math_compute_score": -0.14880952518433332, | |
| "eval_runtime": 94.0737, | |
| "eval_samples_per_second": 0.223, | |
| "eval_steps_per_second": 0.011, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1920.3095703125, | |
| "epoch": 0.07398568019093078, | |
| "grad_norm": 0.23160944462486585, | |
| "kl": 7.987022399902344e-05, | |
| "learning_rate": 9.951579364086603e-07, | |
| "loss": 0.0088, | |
| "reward": 1.1973958909511566, | |
| "reward_std": 0.7134246528148651, | |
| "rewards/": 6.367931842803955, | |
| "rewards/math_compute_score": -0.095238097012043, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1838.6429443359375, | |
| "epoch": 0.07637231503579953, | |
| "grad_norm": 0.24487657247172945, | |
| "kl": 9.202957153320312e-05, | |
| "learning_rate": 9.946059809542706e-07, | |
| "loss": 0.0476, | |
| "reward": 1.3777530193328857, | |
| "reward_std": 0.5670939683914185, | |
| "rewards/": 6.222098350524902, | |
| "rewards/math_compute_score": 0.1666666716337204, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1876.8095703125, | |
| "epoch": 0.07875894988066826, | |
| "grad_norm": 0.2506016708667217, | |
| "kl": 7.772445678710938e-05, | |
| "learning_rate": 9.940244109678041e-07, | |
| "loss": 0.0588, | |
| "reward": 1.0940476655960083, | |
| "reward_std": 0.597551167011261, | |
| "rewards/": 6.803571701049805, | |
| "rewards/math_compute_score": -0.3333333432674408, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1674.4761962890625, | |
| "epoch": 0.081145584725537, | |
| "grad_norm": 0.25999785701346345, | |
| "kl": 7.772445678710938e-05, | |
| "learning_rate": 9.93413261270763e-07, | |
| "loss": 0.0355, | |
| "reward": 1.716294765472412, | |
| "reward_std": 0.4909428358078003, | |
| "rewards/": 6.48623514175415, | |
| "rewards/math_compute_score": 0.523809552192688, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1891.5, | |
| "epoch": 0.08353221957040573, | |
| "grad_norm": 0.26237200501370733, | |
| "kl": 9.72747802734375e-05, | |
| "learning_rate": 9.927725684557339e-07, | |
| "loss": 0.0563, | |
| "reward": 1.2531064748764038, | |
| "reward_std": 0.7479419112205505, | |
| "rewards/": 6.075056076049805, | |
| "rewards/math_compute_score": 0.0476190485060215, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1857.59521484375, | |
| "epoch": 0.08591885441527446, | |
| "grad_norm": 0.23853807456535067, | |
| "kl": 6.198883056640625e-05, | |
| "learning_rate": 9.921023708841973e-07, | |
| "loss": 0.0385, | |
| "reward": 1.121354103088379, | |
| "reward_std": 0.6391122341156006, | |
| "rewards/": 6.368675708770752, | |
| "rewards/math_compute_score": -0.190476194024086, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1826.8333740234375, | |
| "epoch": 0.0883054892601432, | |
| "grad_norm": 0.2518634206321752, | |
| "kl": 8.630752563476562e-05, | |
| "learning_rate": 9.914027086842322e-07, | |
| "loss": 0.0281, | |
| "reward": 1.080822229385376, | |
| "reward_std": 0.7901343703269958, | |
| "rewards/": 6.166015625, | |
| "rewards/math_compute_score": -0.190476194024086, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1974.761962890625, | |
| "epoch": 0.09069212410501193, | |
| "grad_norm": 0.21874504590315716, | |
| "kl": 8.106231689453125e-05, | |
| "learning_rate": 9.906736237481108e-07, | |
| "loss": 0.0347, | |
| "reward": 0.7355794310569763, | |
| "reward_std": 0.943155825138092, | |
| "rewards/": 5.01123046875, | |
| "rewards/math_compute_score": -0.3333333432674408, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1853.9761962890625, | |
| "epoch": 0.09307875894988067, | |
| "grad_norm": 0.2636078943840054, | |
| "kl": 0.00010204315185546875, | |
| "learning_rate": 9.899151597297922e-07, | |
| "loss": 0.0061, | |
| "reward": 0.7527158260345459, | |
| "reward_std": 0.5094756484031677, | |
| "rewards/": 5.76357889175415, | |
| "rewards/math_compute_score": -0.5, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.0954653937947494, | |
| "grad_norm": 0.22442688552857057, | |
| "learning_rate": 9.891273620423082e-07, | |
| "loss": 0.0218, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.0954653937947494, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1899.7857666015625, | |
| "eval_kl": 7.843971252441406e-05, | |
| "eval_loss": 0.040552277117967606, | |
| "eval_reward": 1.2572033554315567, | |
| "eval_reward_std": 0.8417278081178665, | |
| "eval_rewards/": 6.166969180107117, | |
| "eval_rewards/math_compute_score": 0.02976190485060215, | |
| "eval_runtime": 94.3574, | |
| "eval_samples_per_second": 0.223, | |
| "eval_steps_per_second": 0.011, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1945.3333129882812, | |
| "epoch": 0.09785202863961814, | |
| "grad_norm": 0.25352200487817333, | |
| "kl": 7.82012939453125e-05, | |
| "learning_rate": 9.883102778550434e-07, | |
| "loss": 0.0225, | |
| "reward": 1.16990327835083, | |
| "reward_std": 0.6465564370155334, | |
| "rewards/": 6.659040212631226, | |
| "rewards/math_compute_score": -0.2023809514939785, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1662.452392578125, | |
| "epoch": 0.10023866348448687, | |
| "grad_norm": 0.29366924279276885, | |
| "kl": 6.151199340820312e-05, | |
| "learning_rate": 9.874639560909118e-07, | |
| "loss": 0.0167, | |
| "reward": 1.2738094329833984, | |
| "reward_std": 0.7358887791633606, | |
| "rewards/": 5.702381134033203, | |
| "rewards/math_compute_score": 0.1666666716337204, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1590.8333740234375, | |
| "epoch": 0.1026252983293556, | |
| "grad_norm": 0.2978512257146081, | |
| "kl": 9.72747802734375e-05, | |
| "learning_rate": 9.865884474234275e-07, | |
| "loss": 0.068, | |
| "reward": 1.3785715103149414, | |
| "reward_std": 0.6778793334960938, | |
| "rewards/": 5.75, | |
| "rewards/math_compute_score": 0.2857142984867096, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1981.261962890625, | |
| "epoch": 0.10501193317422435, | |
| "grad_norm": 0.2557952927437338, | |
| "kl": 0.0001068115234375, | |
| "learning_rate": 9.856838042736696e-07, | |
| "loss": 0.015, | |
| "reward": 0.920479953289032, | |
| "reward_std": 0.9247375726699829, | |
| "rewards/": 6.126209259033203, | |
| "rewards/math_compute_score": -0.380952388048172, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1779.261962890625, | |
| "epoch": 0.10739856801909307, | |
| "grad_norm": 0.24475320888510424, | |
| "kl": 5.7220458984375e-05, | |
| "learning_rate": 9.847500808071456e-07, | |
| "loss": 0.029, | |
| "reward": 1.5369793176651, | |
| "reward_std": 0.4876500964164734, | |
| "rewards/": 6.161086559295654, | |
| "rewards/math_compute_score": 0.380952388048172, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1841.90478515625, | |
| "epoch": 0.10978520286396182, | |
| "grad_norm": 0.27708975756974613, | |
| "kl": 0.00010347366333007812, | |
| "learning_rate": 9.837873329305457e-07, | |
| "loss": 0.0135, | |
| "reward": 0.7778274416923523, | |
| "reward_std": 0.5871782898902893, | |
| "rewards/": 5.60342264175415, | |
| "rewards/math_compute_score": -0.4285714328289032, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1795.119140625, | |
| "epoch": 0.11217183770883055, | |
| "grad_norm": 0.24633285234690444, | |
| "kl": 7.915496826171875e-05, | |
| "learning_rate": 9.82795618288397e-07, | |
| "loss": 0.0583, | |
| "reward": 1.1924666166305542, | |
| "reward_std": 0.8271605372428894, | |
| "rewards/": 5.390903949737549, | |
| "rewards/math_compute_score": 0.1428571492433548, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1873.6429443359375, | |
| "epoch": 0.11455847255369929, | |
| "grad_norm": 0.23871306735217734, | |
| "kl": 8.726119995117188e-05, | |
| "learning_rate": 9.817749962596114e-07, | |
| "loss": 0.0365, | |
| "reward": 1.1016182899475098, | |
| "reward_std": 0.7240482568740845, | |
| "rewards/": 6.650949001312256, | |
| "rewards/math_compute_score": -0.2857142984867096, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1836.5238037109375, | |
| "epoch": 0.11694510739856802, | |
| "grad_norm": 0.2602991840828808, | |
| "kl": 7.200241088867188e-05, | |
| "learning_rate": 9.807255279539312e-07, | |
| "loss": 0.0571, | |
| "reward": 1.3672620058059692, | |
| "reward_std": 0.6877846121788025, | |
| "rewards/": 6.550595283508301, | |
| "rewards/math_compute_score": 0.0714285746216774, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.11933174224343675, | |
| "grad_norm": 0.257239746248728, | |
| "learning_rate": 9.796472762082685e-07, | |
| "loss": 0.1056, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.11933174224343675, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1877.7976379394531, | |
| "eval_kl": 8.618831634521484e-05, | |
| "eval_loss": 0.0589974969625473, | |
| "eval_reward": 1.3616862893104553, | |
| "eval_reward_std": 0.7793268263339996, | |
| "eval_rewards/": 6.427478790283203, | |
| "eval_rewards/math_compute_score": 0.095238097012043, | |
| "eval_runtime": 96.4379, | |
| "eval_samples_per_second": 0.218, | |
| "eval_steps_per_second": 0.01, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1820.0833740234375, | |
| "epoch": 0.12171837708830549, | |
| "grad_norm": 0.2573216950537411, | |
| "kl": 9.870529174804688e-05, | |
| "learning_rate": 9.785403055829448e-07, | |
| "loss": 0.0382, | |
| "reward": 1.4516844153404236, | |
| "reward_std": 0.7663153111934662, | |
| "rewards/": 6.115564346313477, | |
| "rewards/math_compute_score": 0.2857142798602581, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1893.261962890625, | |
| "epoch": 0.12410501193317422, | |
| "grad_norm": 0.25066924022690434, | |
| "kl": 0.00010347366333007812, | |
| "learning_rate": 9.77404682357824e-07, | |
| "loss": 0.0493, | |
| "reward": 1.5894160270690918, | |
| "reward_std": 0.8626825213432312, | |
| "rewards/": 7.185174942016602, | |
| "rewards/math_compute_score": 0.190476194024086, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1847.4285888671875, | |
| "epoch": 0.12649164677804295, | |
| "grad_norm": 0.26985384617928543, | |
| "kl": 0.000110626220703125, | |
| "learning_rate": 9.762404745283437e-07, | |
| "loss": 0.081, | |
| "reward": 0.982366144657135, | |
| "reward_std": 0.8509321808815002, | |
| "rewards/": 5.67373514175415, | |
| "rewards/math_compute_score": -0.190476194024086, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1893.666748046875, | |
| "epoch": 0.1288782816229117, | |
| "grad_norm": 0.23216642387230993, | |
| "kl": 9.775161743164062e-05, | |
| "learning_rate": 9.75047751801446e-07, | |
| "loss": 0.0562, | |
| "reward": 1.1761904954910278, | |
| "reward_std": 0.5184506773948669, | |
| "rewards/": 5.976190567016602, | |
| "rewards/math_compute_score": -0.02380952425301075, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1542.6190185546875, | |
| "epoch": 0.13126491646778043, | |
| "grad_norm": 0.2931169431343856, | |
| "kl": 0.00011014938354492188, | |
| "learning_rate": 9.738265855914012e-07, | |
| "loss": 0.0965, | |
| "reward": 1.5120384693145752, | |
| "reward_std": 0.8640336990356445, | |
| "rewards/": 5.941144943237305, | |
| "rewards/math_compute_score": 0.4047619104385376, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1727.4761962890625, | |
| "epoch": 0.13365155131264916, | |
| "grad_norm": 0.2751344219541302, | |
| "kl": 0.00011014938354492188, | |
| "learning_rate": 9.725770490155338e-07, | |
| "loss": 0.0159, | |
| "reward": 1.548958420753479, | |
| "reward_std": 0.5042334794998169, | |
| "rewards/": 6.792410850524902, | |
| "rewards/math_compute_score": 0.2380952388048172, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1785.3095703125, | |
| "epoch": 0.1360381861575179, | |
| "grad_norm": 0.24999827557080748, | |
| "kl": 0.000125885009765625, | |
| "learning_rate": 9.712992168898435e-07, | |
| "loss": 0.0367, | |
| "reward": 1.216183066368103, | |
| "reward_std": 0.8457887768745422, | |
| "rewards/": 5.985677242279053, | |
| "rewards/math_compute_score": 0.02380952425301075, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1770.2381591796875, | |
| "epoch": 0.13842482100238662, | |
| "grad_norm": 0.2367607232006906, | |
| "kl": 0.00011157989501953125, | |
| "learning_rate": 9.699931657245263e-07, | |
| "loss": 0.0687, | |
| "reward": 1.097646951675415, | |
| "reward_std": 0.6355366706848145, | |
| "rewards/": 5.012044429779053, | |
| "rewards/math_compute_score": 0.1190476194024086, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1794.21435546875, | |
| "epoch": 0.14081145584725538, | |
| "grad_norm": 0.2837311661067899, | |
| "kl": 0.00015926361083984375, | |
| "learning_rate": 9.686589737193928e-07, | |
| "loss": 0.0467, | |
| "reward": 1.027864694595337, | |
| "reward_std": 0.7552880048751831, | |
| "rewards/": 5.615513324737549, | |
| "rewards/math_compute_score": -0.1190476194024086, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.1431980906921241, | |
| "grad_norm": 0.23325912751014408, | |
| "learning_rate": 9.67296720759187e-07, | |
| "loss": 0.0344, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.1431980906921241, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1855.9643249511719, | |
| "eval_kl": 0.0001424551010131836, | |
| "eval_loss": 0.04132155328989029, | |
| "eval_reward": 1.3015253245830536, | |
| "eval_reward_std": 0.6968550086021423, | |
| "eval_rewards/": 6.221912384033203, | |
| "eval_rewards/math_compute_score": 0.07142857741564512, | |
| "eval_runtime": 93.1523, | |
| "eval_samples_per_second": 0.225, | |
| "eval_steps_per_second": 0.011, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1760.2500610351562, | |
| "epoch": 0.14558472553699284, | |
| "grad_norm": 0.24769237773421138, | |
| "kl": 0.0001201629638671875, | |
| "learning_rate": 9.659064884088016e-07, | |
| "loss": 0.0936, | |
| "reward": 1.3301293551921844, | |
| "reward_std": 0.6551631987094879, | |
| "rewards/": 6.1744561195373535, | |
| "rewards/math_compute_score": 0.1190476268529892, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1992.7857666015625, | |
| "epoch": 0.14797136038186157, | |
| "grad_norm": 0.2486362317475695, | |
| "kl": 0.0001544952392578125, | |
| "learning_rate": 9.644883599083957e-07, | |
| "loss": 0.0196, | |
| "reward": 0.8310267925262451, | |
| "reward_std": 0.5494909882545471, | |
| "rewards/": 6.250371932983398, | |
| "rewards/math_compute_score": -0.523809552192688, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1843.2857666015625, | |
| "epoch": 0.15035799522673032, | |
| "grad_norm": 0.2332848729776955, | |
| "kl": 0.00014209747314453125, | |
| "learning_rate": 9.630424201684103e-07, | |
| "loss": -0.011, | |
| "reward": 1.4165923595428467, | |
| "reward_std": 0.44226470589637756, | |
| "rewards/": 6.511532783508301, | |
| "rewards/math_compute_score": 0.1428571492433548, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1565.09521484375, | |
| "epoch": 0.15274463007159905, | |
| "grad_norm": 0.26464093358588886, | |
| "kl": 0.000156402587890625, | |
| "learning_rate": 9.615687557644848e-07, | |
| "loss": 0.0191, | |
| "reward": 1.9476191997528076, | |
| "reward_std": 0.4103147089481354, | |
| "rewards/": 6.690476417541504, | |
| "rewards/math_compute_score": 0.761904776096344, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1935.4285888671875, | |
| "epoch": 0.15513126491646778, | |
| "grad_norm": 0.2563941042729108, | |
| "kl": 0.000148773193359375, | |
| "learning_rate": 9.600674549322716e-07, | |
| "loss": 0.0453, | |
| "reward": 1.275520920753479, | |
| "reward_std": 0.7737724781036377, | |
| "rewards/": 6.758556842803955, | |
| "rewards/math_compute_score": -0.095238097012043, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1833.5, | |
| "epoch": 0.1575178997613365, | |
| "grad_norm": 0.2318537716691269, | |
| "kl": 0.00012302398681640625, | |
| "learning_rate": 9.585386075621552e-07, | |
| "loss": 0.0385, | |
| "reward": 1.5590215921401978, | |
| "reward_std": 0.6652101278305054, | |
| "rewards/": 6.461774826049805, | |
| "rewards/math_compute_score": 0.3333333432674408, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1752.4285888671875, | |
| "epoch": 0.15990453460620524, | |
| "grad_norm": 0.27198514000671786, | |
| "kl": 0.00019741058349609375, | |
| "learning_rate": 9.569823051938689e-07, | |
| "loss": 0.09, | |
| "reward": 1.275632381439209, | |
| "reward_std": 0.663583517074585, | |
| "rewards/": 5.901971817016602, | |
| "rewards/math_compute_score": 0.1190476194024086, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1832.6429443359375, | |
| "epoch": 0.162291169451074, | |
| "grad_norm": 0.2821458726503061, | |
| "kl": 0.0002574920654296875, | |
| "learning_rate": 9.553986410110134e-07, | |
| "loss": 0.0101, | |
| "reward": 1.6696429252624512, | |
| "reward_std": 0.5702826380729675, | |
| "rewards/": 6.824404716491699, | |
| "rewards/math_compute_score": 0.380952388048172, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2015.90478515625, | |
| "epoch": 0.16467780429594273, | |
| "grad_norm": 0.2419566650414704, | |
| "kl": 0.00018024444580078125, | |
| "learning_rate": 9.537877098354784e-07, | |
| "loss": 0.0112, | |
| "reward": 0.877566933631897, | |
| "reward_std": 0.7397137880325317, | |
| "rewards/": 6.19735860824585, | |
| "rewards/math_compute_score": -0.4523809552192688, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.16706443914081145, | |
| "grad_norm": 0.22253088780698607, | |
| "learning_rate": 9.52149608121765e-07, | |
| "loss": 0.021, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.16706443914081145, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1840.1190795898438, | |
| "eval_kl": 0.00020503997802734375, | |
| "eval_loss": 0.03286560997366905, | |
| "eval_reward": 1.40643610060215, | |
| "eval_reward_std": 0.6496933400630951, | |
| "eval_rewards/": 6.246465802192688, | |
| "eval_rewards/math_compute_score": 0.1964285671710968, | |
| "eval_runtime": 92.7357, | |
| "eval_samples_per_second": 0.226, | |
| "eval_steps_per_second": 0.011, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1853.15478515625, | |
| "epoch": 0.16945107398568018, | |
| "grad_norm": 0.20266438209975887, | |
| "kl": 0.00015115737915039062, | |
| "learning_rate": 9.504844339512094e-07, | |
| "loss": -0.0027, | |
| "reward": 1.1317429542541504, | |
| "reward_std": 0.7229768335819244, | |
| "rewards/": 6.230143308639526, | |
| "rewards/math_compute_score": -0.1428571455180645, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1757.857177734375, | |
| "epoch": 0.1718377088305489, | |
| "grad_norm": 0.2748053071046328, | |
| "kl": 0.000278472900390625, | |
| "learning_rate": 9.487922870261121e-07, | |
| "loss": 0.0455, | |
| "reward": 1.6033483743667603, | |
| "reward_std": 0.7141416668891907, | |
| "rewards/": 6.492931842803955, | |
| "rewards/math_compute_score": 0.380952388048172, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1690.3809814453125, | |
| "epoch": 0.17422434367541767, | |
| "grad_norm": 0.2990083388684613, | |
| "kl": 0.00022125244140625, | |
| "learning_rate": 9.470732686637664e-07, | |
| "loss": 0.0544, | |
| "reward": 1.4808036088943481, | |
| "reward_std": 0.7631458044052124, | |
| "rewards/": 6.546875, | |
| "rewards/math_compute_score": 0.2142857164144516, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1857.666748046875, | |
| "epoch": 0.1766109785202864, | |
| "grad_norm": 0.23969410631706858, | |
| "kl": 0.0002002716064453125, | |
| "learning_rate": 9.45327481790393e-07, | |
| "loss": 0.0015, | |
| "reward": 1.123772382736206, | |
| "reward_std": 0.5604047179222107, | |
| "rewards/": 6.476004600524902, | |
| "rewards/math_compute_score": -0.2142857164144516, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1976.3095703125, | |
| "epoch": 0.17899761336515513, | |
| "grad_norm": 0.24073302005555794, | |
| "kl": 0.0002498626708984375, | |
| "learning_rate": 9.435550309349776e-07, | |
| "loss": 0.0242, | |
| "reward": 1.314062476158142, | |
| "reward_std": 0.7065877914428711, | |
| "rewards/": 6.95126485824585, | |
| "rewards/math_compute_score": -0.095238097012043, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1773.8095703125, | |
| "epoch": 0.18138424821002386, | |
| "grad_norm": 0.29878792702659795, | |
| "kl": 0.0004253387451171875, | |
| "learning_rate": 9.417560222230114e-07, | |
| "loss": 0.0681, | |
| "reward": 1.7110120058059692, | |
| "reward_std": 0.5763629674911499, | |
| "rewards/": 7.221726417541504, | |
| "rewards/math_compute_score": 0.3333333432674408, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1844.3095703125, | |
| "epoch": 0.18377088305489261, | |
| "grad_norm": 0.2539852824553324, | |
| "kl": 0.00025177001953125, | |
| "learning_rate": 9.399305633701372e-07, | |
| "loss": 0.0414, | |
| "reward": 1.2555060386657715, | |
| "reward_std": 0.6134779453277588, | |
| "rewards/": 6.658482074737549, | |
| "rewards/math_compute_score": -0.095238097012043, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1619.6429443359375, | |
| "epoch": 0.18615751789976134, | |
| "grad_norm": 0.27715552033849417, | |
| "kl": 0.0003719329833984375, | |
| "learning_rate": 9.380787636757e-07, | |
| "loss": 0.0674, | |
| "reward": 1.380022406578064, | |
| "reward_std": 0.4771695137023926, | |
| "rewards/": 6.328683376312256, | |
| "rewards/math_compute_score": 0.1428571492433548, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1651.6190185546875, | |
| "epoch": 0.18854415274463007, | |
| "grad_norm": 0.2917968349033686, | |
| "kl": 0.00030517578125, | |
| "learning_rate": 9.362007340162028e-07, | |
| "loss": 0.0588, | |
| "reward": 1.4904018640518188, | |
| "reward_std": 0.649864912033081, | |
| "rewards/": 6.499628067016602, | |
| "rewards/math_compute_score": 0.2380952388048172, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.1909307875894988, | |
| "grad_norm": 0.25391040017369215, | |
| "learning_rate": 9.342965868386673e-07, | |
| "loss": 0.0279, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.1909307875894988, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1795.4226684570312, | |
| "eval_kl": 0.0003237724304199219, | |
| "eval_loss": 0.04981280118227005, | |
| "eval_reward": 1.4364235252141953, | |
| "eval_reward_std": 0.6483379453420639, | |
| "eval_rewards/": 6.396403074264526, | |
| "eval_rewards/math_compute_score": 0.1964285746216774, | |
| "eval_runtime": 92.6366, | |
| "eval_samples_per_second": 0.227, | |
| "eval_steps_per_second": 0.011, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1711.952392578125, | |
| "epoch": 0.19331742243436753, | |
| "grad_norm": 0.24040818501985536, | |
| "kl": 0.000331878662109375, | |
| "learning_rate": 9.323664361539018e-07, | |
| "loss": 0.0842, | |
| "reward": 1.5535017251968384, | |
| "reward_std": 0.7349307537078857, | |
| "rewards/": 6.338937044143677, | |
| "rewards/math_compute_score": 0.3571428544819355, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1713.6190185546875, | |
| "epoch": 0.1957040572792363, | |
| "grad_norm": 0.2780469140699165, | |
| "kl": 0.00043487548828125, | |
| "learning_rate": 9.304103975296748e-07, | |
| "loss": 0.0477, | |
| "reward": 1.3992561101913452, | |
| "reward_std": 0.5194663405418396, | |
| "rewards/": 6.424851417541504, | |
| "rewards/math_compute_score": 0.1428571492433548, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1488.40478515625, | |
| "epoch": 0.19809069212410502, | |
| "grad_norm": 0.33565417772392336, | |
| "kl": 0.0004444122314453125, | |
| "learning_rate": 9.284285880837946e-07, | |
| "loss": 0.0969, | |
| "reward": 1.6488840579986572, | |
| "reward_std": 0.5082271099090576, | |
| "rewards/": 6.149181842803955, | |
| "rewards/math_compute_score": 0.523809552192688, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1670.1905517578125, | |
| "epoch": 0.20047732696897375, | |
| "grad_norm": 0.24053030643983023, | |
| "kl": 0.00046539306640625, | |
| "learning_rate": 9.264211264770976e-07, | |
| "loss": 0.027, | |
| "reward": 1.6110121011734009, | |
| "reward_std": 0.29187697172164917, | |
| "rewards/": 6.91220235824585, | |
| "rewards/math_compute_score": 0.2857142984867096, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1544.047607421875, | |
| "epoch": 0.20286396181384247, | |
| "grad_norm": 0.25209602424956346, | |
| "kl": 0.000377655029296875, | |
| "learning_rate": 9.243881329063434e-07, | |
| "loss": 0.0807, | |
| "reward": 1.6919922828674316, | |
| "reward_std": 0.6165055632591248, | |
| "rewards/": 6.555199146270752, | |
| "rewards/math_compute_score": 0.4761904776096344, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1686.3095703125, | |
| "epoch": 0.2052505966587112, | |
| "grad_norm": 0.2538348128761773, | |
| "kl": 0.0004749298095703125, | |
| "learning_rate": 9.223297290970179e-07, | |
| "loss": 0.0045, | |
| "reward": 1.6205357313156128, | |
| "reward_std": 0.791550874710083, | |
| "rewards/": 6.578869342803955, | |
| "rewards/math_compute_score": 0.380952388048172, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1736.666748046875, | |
| "epoch": 0.20763723150357996, | |
| "grad_norm": 0.23799398657467383, | |
| "kl": 0.000453948974609375, | |
| "learning_rate": 9.202460382960447e-07, | |
| "loss": 0.0187, | |
| "reward": 1.296758770942688, | |
| "reward_std": 0.758968710899353, | |
| "rewards/": 5.912365436553955, | |
| "rewards/math_compute_score": 0.1428571492433548, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1815.7857666015625, | |
| "epoch": 0.2100238663484487, | |
| "grad_norm": 0.2548996775884122, | |
| "kl": 0.0004329681396484375, | |
| "learning_rate": 9.181371852644063e-07, | |
| "loss": -0.0126, | |
| "reward": 1.1803152561187744, | |
| "reward_std": 0.7856223583221436, | |
| "rewards/": 6.66348123550415, | |
| "rewards/math_compute_score": -0.190476194024086, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1961.047607421875, | |
| "epoch": 0.21241050119331742, | |
| "grad_norm": 0.23616170381807244, | |
| "kl": 0.00052642822265625, | |
| "learning_rate": 9.160032962696734e-07, | |
| "loss": 0.0225, | |
| "reward": 1.0959078073501587, | |
| "reward_std": 0.7875651717185974, | |
| "rewards/": 6.431919574737549, | |
| "rewards/math_compute_score": -0.2380952388048172, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.21479713603818615, | |
| "grad_norm": 0.3028870348853355, | |
| "learning_rate": 9.138444990784453e-07, | |
| "loss": 0.0746, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.21479713603818615, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1827.8512268066406, | |
| "eval_kl": 0.0005598068237304688, | |
| "eval_loss": 0.05406690388917923, | |
| "eval_reward": 1.4557756930589676, | |
| "eval_reward_std": 0.641949962824583, | |
| "eval_rewards/": 6.326497554779053, | |
| "eval_rewards/math_compute_score": 0.2380952462553978, | |
| "eval_runtime": 97.7893, | |
| "eval_samples_per_second": 0.215, | |
| "eval_steps_per_second": 0.01, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1750.9404907226562, | |
| "epoch": 0.2171837708830549, | |
| "grad_norm": 0.28025306135175526, | |
| "kl": 0.000518798828125, | |
| "learning_rate": 9.116609229486991e-07, | |
| "loss": 0.0822, | |
| "reward": 1.4702892303466797, | |
| "reward_std": 0.848765641450882, | |
| "rewards/": 6.399065256118774, | |
| "rewards/math_compute_score": 0.2380952462553978, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1697.2857666015625, | |
| "epoch": 0.21957040572792363, | |
| "grad_norm": 0.2620909431841143, | |
| "kl": 0.000553131103515625, | |
| "learning_rate": 9.094526986220512e-07, | |
| "loss": -0.0109, | |
| "reward": 1.3878443241119385, | |
| "reward_std": 0.7288548946380615, | |
| "rewards/": 6.177316188812256, | |
| "rewards/math_compute_score": 0.190476194024086, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1827.4285888671875, | |
| "epoch": 0.22195704057279236, | |
| "grad_norm": 0.2504018080609149, | |
| "kl": 0.00055694580078125, | |
| "learning_rate": 9.072199583159284e-07, | |
| "loss": 0.0395, | |
| "reward": 1.2157739400863647, | |
| "reward_std": 0.6643213033676147, | |
| "rewards/": 6.269345283508301, | |
| "rewards/math_compute_score": -0.0476190485060215, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1795.3095703125, | |
| "epoch": 0.2243436754176611, | |
| "grad_norm": 0.22970532451736303, | |
| "kl": 0.0004730224609375, | |
| "learning_rate": 9.04962835715652e-07, | |
| "loss": 0.0283, | |
| "reward": 1.2190290689468384, | |
| "reward_std": 0.7647516131401062, | |
| "rewards/": 6.285621166229248, | |
| "rewards/math_compute_score": -0.0476190485060215, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1790.047607421875, | |
| "epoch": 0.22673031026252982, | |
| "grad_norm": 0.21772408919059522, | |
| "kl": 0.00058746337890625, | |
| "learning_rate": 9.02681465966433e-07, | |
| "loss": -0.021, | |
| "reward": 1.447767972946167, | |
| "reward_std": 0.6884434819221497, | |
| "rewards/": 6.476934432983398, | |
| "rewards/math_compute_score": 0.190476194024086, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1935.3809814453125, | |
| "epoch": 0.22911694510739858, | |
| "grad_norm": 0.25618968897371547, | |
| "kl": 0.000701904296875, | |
| "learning_rate": 9.003759856652801e-07, | |
| "loss": 0.0269, | |
| "reward": 1.425186038017273, | |
| "reward_std": 0.651878833770752, | |
| "rewards/": 6.744977951049805, | |
| "rewards/math_compute_score": 0.095238097012043, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1660.0238037109375, | |
| "epoch": 0.2315035799522673, | |
| "grad_norm": 0.2838831684396231, | |
| "kl": 0.000720977783203125, | |
| "learning_rate": 8.980465328528218e-07, | |
| "loss": 0.0505, | |
| "reward": 1.8797248601913452, | |
| "reward_std": 0.4036564528942108, | |
| "rewards/": 7.2081475257873535, | |
| "rewards/math_compute_score": 0.5476190447807312, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1829.8333740234375, | |
| "epoch": 0.23389021479713604, | |
| "grad_norm": 0.22853372167994854, | |
| "kl": 0.00064849853515625, | |
| "learning_rate": 8.956932470050403e-07, | |
| "loss": 0.0511, | |
| "reward": 1.724107265472412, | |
| "reward_std": 0.8355273604393005, | |
| "rewards/": 6.52529764175415, | |
| "rewards/math_compute_score": 0.523809552192688, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1856.261962890625, | |
| "epoch": 0.23627684964200477, | |
| "grad_norm": 0.2512391392455568, | |
| "kl": 0.00080108642578125, | |
| "learning_rate": 8.933162690249208e-07, | |
| "loss": 0.0423, | |
| "reward": 1.442262053489685, | |
| "reward_std": 0.709228515625, | |
| "rewards/": 6.449404716491699, | |
| "rewards/math_compute_score": 0.190476194024086, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.2386634844868735, | |
| "grad_norm": 0.26580287893671717, | |
| "learning_rate": 8.909157412340149e-07, | |
| "loss": 0.0522, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2386634844868735, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1771.4226684570312, | |
| "eval_kl": 0.0007572174072265625, | |
| "eval_loss": 0.04274141788482666, | |
| "eval_reward": 1.5543109476566315, | |
| "eval_reward_std": 0.5933023318648338, | |
| "eval_rewards/": 6.509649395942688, | |
| "eval_rewards/math_compute_score": 0.315476194024086, | |
| "eval_runtime": 91.7292, | |
| "eval_samples_per_second": 0.229, | |
| "eval_steps_per_second": 0.011, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1773.857177734375, | |
| "epoch": 0.24105011933174225, | |
| "grad_norm": 0.2651946749721896, | |
| "kl": 0.0009403228759765625, | |
| "learning_rate": 8.884918073639189e-07, | |
| "loss": 0.0451, | |
| "reward": 1.227901816368103, | |
| "reward_std": 0.608531042933464, | |
| "rewards/": 5.949032783508301, | |
| "rewards/math_compute_score": 0.0476190485060215, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1692.7381591796875, | |
| "epoch": 0.24343675417661098, | |
| "grad_norm": 0.25923640399562387, | |
| "kl": 0.00080108642578125, | |
| "learning_rate": 8.860446125476686e-07, | |
| "loss": 0.0513, | |
| "reward": 1.4982887506484985, | |
| "reward_std": 0.3660634756088257, | |
| "rewards/": 6.634300708770752, | |
| "rewards/math_compute_score": 0.2142857164144516, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1747.0, | |
| "epoch": 0.2458233890214797, | |
| "grad_norm": 0.25927402841933767, | |
| "kl": 0.000736236572265625, | |
| "learning_rate": 8.835743033110482e-07, | |
| "loss": 0.0204, | |
| "reward": 1.127715826034546, | |
| "reward_std": 0.7056443691253662, | |
| "rewards/": 6.40048360824585, | |
| "rewards/math_compute_score": -0.190476194024086, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1954.90478515625, | |
| "epoch": 0.24821002386634844, | |
| "grad_norm": 0.23099992586235157, | |
| "kl": 0.0007476806640625, | |
| "learning_rate": 8.810810275638182e-07, | |
| "loss": 0.0478, | |
| "reward": 1.1020090579986572, | |
| "reward_std": 0.8382893800735474, | |
| "rewards/": 6.462425708770752, | |
| "rewards/math_compute_score": -0.2380952388048172, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1456.952392578125, | |
| "epoch": 0.25059665871121717, | |
| "grad_norm": 0.2925044722785809, | |
| "kl": 0.000957489013671875, | |
| "learning_rate": 8.785649345908587e-07, | |
| "loss": 0.0075, | |
| "reward": 1.6823569536209106, | |
| "reward_std": 0.5539329648017883, | |
| "rewards/": 6.3165459632873535, | |
| "rewards/math_compute_score": 0.523809552192688, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1802.4761962890625, | |
| "epoch": 0.2529832935560859, | |
| "grad_norm": 0.25996955962455726, | |
| "kl": 0.000904083251953125, | |
| "learning_rate": 8.760261750432312e-07, | |
| "loss": 0.0588, | |
| "reward": 1.4761160612106323, | |
| "reward_std": 0.5810412764549255, | |
| "rewards/": 6.618675708770752, | |
| "rewards/math_compute_score": 0.190476194024086, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1391.2381591796875, | |
| "epoch": 0.2553699284009546, | |
| "grad_norm": 0.2857669697125554, | |
| "kl": 0.00119781494140625, | |
| "learning_rate": 8.734649009291583e-07, | |
| "loss": 0.0018, | |
| "reward": 1.8566220998764038, | |
| "reward_std": 0.41003191471099854, | |
| "rewards/": 6.2354912757873535, | |
| "rewards/math_compute_score": 0.761904776096344, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1657.09521484375, | |
| "epoch": 0.2577565632458234, | |
| "grad_norm": 0.2581521551227329, | |
| "kl": 0.000820159912109375, | |
| "learning_rate": 8.708812656049225e-07, | |
| "loss": 0.0116, | |
| "reward": 1.7328126430511475, | |
| "reward_std": 0.5485076308250427, | |
| "rewards/": 6.28311014175415, | |
| "rewards/math_compute_score": 0.5952380895614624, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1695.3333740234375, | |
| "epoch": 0.26014319809069214, | |
| "grad_norm": 0.22118118373246637, | |
| "kl": 0.000804901123046875, | |
| "learning_rate": 8.68275423765683e-07, | |
| "loss": -0.0055, | |
| "reward": 1.378050684928894, | |
| "reward_std": 0.5073475241661072, | |
| "rewards/": 6.699777126312256, | |
| "rewards/math_compute_score": 0.0476190485060215, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.26252983293556087, | |
| "grad_norm": 0.23367045175580944, | |
| "learning_rate": 8.656475314362147e-07, | |
| "loss": 0.0098, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.26252983293556087, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1761.2619323730469, | |
| "eval_kl": 0.0010433197021484375, | |
| "eval_loss": 0.041358206421136856, | |
| "eval_reward": 1.4955194890499115, | |
| "eval_reward_std": 0.6651220917701721, | |
| "eval_rewards/": 6.382359266281128, | |
| "eval_rewards/math_compute_score": 0.2738095261156559, | |
| "eval_runtime": 91.2376, | |
| "eval_samples_per_second": 0.23, | |
| "eval_steps_per_second": 0.011, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1746.5119018554688, | |
| "epoch": 0.2649164677804296, | |
| "grad_norm": 0.22778632128851484, | |
| "kl": 0.0009899139404296875, | |
| "learning_rate": 8.629977459615654e-07, | |
| "loss": -0.0063, | |
| "reward": 1.8497769236564636, | |
| "reward_std": 0.44625431299209595, | |
| "rewards/": 7.010788679122925, | |
| "rewards/math_compute_score": 0.5595238134264946, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1869.4285888671875, | |
| "epoch": 0.26730310262529833, | |
| "grad_norm": 0.22506830297475983, | |
| "kl": 0.000957489013671875, | |
| "learning_rate": 8.603262259976348e-07, | |
| "loss": 0.0298, | |
| "reward": 1.6217262744903564, | |
| "reward_std": 0.5545719265937805, | |
| "rewards/": 7.15625, | |
| "rewards/math_compute_score": 0.2380952388048172, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1583.8333740234375, | |
| "epoch": 0.26968973747016706, | |
| "grad_norm": 0.26335556959246714, | |
| "kl": 0.001190185546875, | |
| "learning_rate": 8.576331315016751e-07, | |
| "loss": 0.074, | |
| "reward": 1.390029788017273, | |
| "reward_std": 0.7560327053070068, | |
| "rewards/": 6.283482074737549, | |
| "rewards/math_compute_score": 0.1666666716337204, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1648.7857666015625, | |
| "epoch": 0.2720763723150358, | |
| "grad_norm": 0.24404060061496832, | |
| "kl": 0.001220703125, | |
| "learning_rate": 8.549186237227138e-07, | |
| "loss": -0.0057, | |
| "reward": 1.8864582777023315, | |
| "reward_std": 0.5348789691925049, | |
| "rewards/": 7.051339626312256, | |
| "rewards/math_compute_score": 0.5952380895614624, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1813.2381591796875, | |
| "epoch": 0.2744630071599045, | |
| "grad_norm": 0.29199548341678777, | |
| "kl": 0.00124359130859375, | |
| "learning_rate": 8.52182865191898e-07, | |
| "loss": 0.0717, | |
| "reward": 1.410640001296997, | |
| "reward_std": 0.8225697875022888, | |
| "rewards/": 6.481770992279053, | |
| "rewards/math_compute_score": 0.1428571492433548, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1860.6905517578125, | |
| "epoch": 0.27684964200477324, | |
| "grad_norm": 0.23564871797794462, | |
| "kl": 0.0011138916015625, | |
| "learning_rate": 8.494260197127648e-07, | |
| "loss": -0.0066, | |
| "reward": 1.5276786088943481, | |
| "reward_std": 0.48343804478645325, | |
| "rewards/": 7.066964626312256, | |
| "rewards/math_compute_score": 0.1428571492433548, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1599.0714111328125, | |
| "epoch": 0.27923627684964203, | |
| "grad_norm": 0.2704337659468572, | |
| "kl": 0.00121307373046875, | |
| "learning_rate": 8.466482523514309e-07, | |
| "loss": 0.0398, | |
| "reward": 1.6256511211395264, | |
| "reward_std": 0.7011668682098389, | |
| "rewards/": 6.699683666229248, | |
| "rewards/math_compute_score": 0.3571428656578064, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1592.857177734375, | |
| "epoch": 0.28162291169451076, | |
| "grad_norm": 0.2476126055211145, | |
| "kl": 0.00128936767578125, | |
| "learning_rate": 8.438497294267116e-07, | |
| "loss": 0.0137, | |
| "reward": 2.154017925262451, | |
| "reward_std": 0.3840785324573517, | |
| "rewards/": 7.531994342803955, | |
| "rewards/math_compute_score": 0.8095238208770752, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1497.952392578125, | |
| "epoch": 0.2840095465393795, | |
| "grad_norm": 0.2830025386276568, | |
| "kl": 0.00151824951171875, | |
| "learning_rate": 8.41030618500161e-07, | |
| "loss": 0.0483, | |
| "reward": 1.8147321939468384, | |
| "reward_std": 0.5045832395553589, | |
| "rewards/": 7.359375, | |
| "rewards/math_compute_score": 0.4285714328289032, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.2863961813842482, | |
| "grad_norm": 0.22278442425352968, | |
| "learning_rate": 8.381910883660399e-07, | |
| "loss": 0.0265, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.2863961813842482, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1761.5595397949219, | |
| "eval_kl": 0.0013446807861328125, | |
| "eval_loss": 0.06586353480815887, | |
| "eval_reward": 1.6639322936534882, | |
| "eval_reward_std": 0.5864010900259018, | |
| "eval_rewards/": 6.629185318946838, | |
| "eval_rewards/math_compute_score": 0.42261905781924725, | |
| "eval_runtime": 91.3472, | |
| "eval_samples_per_second": 0.23, | |
| "eval_steps_per_second": 0.011, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1705.65478515625, | |
| "epoch": 0.28878281622911695, | |
| "grad_norm": 0.2508833756487164, | |
| "kl": 0.00110626220703125, | |
| "learning_rate": 8.353313090412091e-07, | |
| "loss": 0.0286, | |
| "reward": 1.5229679942131042, | |
| "reward_std": 0.39858949184417725, | |
| "rewards/": 6.662458419799805, | |
| "rewards/math_compute_score": 0.238095223903656, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1615.547607421875, | |
| "epoch": 0.2911694510739857, | |
| "grad_norm": 0.2961002801426825, | |
| "kl": 0.0015106201171875, | |
| "learning_rate": 8.3245145175495e-07, | |
| "loss": 0.0157, | |
| "reward": 1.5044642686843872, | |
| "reward_std": 0.36025503277778625, | |
| "rewards/": 6.569940567016602, | |
| "rewards/math_compute_score": 0.2380952388048172, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1715.0714111328125, | |
| "epoch": 0.2935560859188544, | |
| "grad_norm": 0.2873657925294743, | |
| "kl": 0.0014190673828125, | |
| "learning_rate": 8.295516889387114e-07, | |
| "loss": 0.0686, | |
| "reward": 1.341320276260376, | |
| "reward_std": 0.5270970463752747, | |
| "rewards/": 6.420886993408203, | |
| "rewards/math_compute_score": 0.0714285746216774, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1663.047607421875, | |
| "epoch": 0.29594272076372313, | |
| "grad_norm": 0.32394764493704503, | |
| "kl": 0.00144195556640625, | |
| "learning_rate": 8.266321942157859e-07, | |
| "loss": 0.0717, | |
| "reward": 1.2409132719039917, | |
| "reward_std": 0.8710536360740662, | |
| "rewards/": 6.204566478729248, | |
| "rewards/math_compute_score": 0.0, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1718.40478515625, | |
| "epoch": 0.29832935560859186, | |
| "grad_norm": 0.26261921476130473, | |
| "kl": 0.0014495849609375, | |
| "learning_rate": 8.236931423909138e-07, | |
| "loss": 0.0276, | |
| "reward": 1.4648065567016602, | |
| "reward_std": 0.8197119832038879, | |
| "rewards/": 6.181175708770752, | |
| "rewards/math_compute_score": 0.2857142984867096, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1603.666748046875, | |
| "epoch": 0.30071599045346065, | |
| "grad_norm": 0.2761289543593296, | |
| "kl": 0.00135040283203125, | |
| "learning_rate": 8.207347094398171e-07, | |
| "loss": 0.0157, | |
| "reward": 1.2167319059371948, | |
| "reward_std": 0.5766604542732239, | |
| "rewards/": 5.893182754516602, | |
| "rewards/math_compute_score": 0.0476190485060215, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1712.166748046875, | |
| "epoch": 0.3031026252983294, | |
| "grad_norm": 0.3314518691922463, | |
| "kl": 0.001708984375, | |
| "learning_rate": 8.177570724986626e-07, | |
| "loss": 0.0594, | |
| "reward": 1.0614583492279053, | |
| "reward_std": 0.4964538514614105, | |
| "rewards/": 5.783482074737549, | |
| "rewards/math_compute_score": -0.1190476194024086, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1777.59521484375, | |
| "epoch": 0.3054892601431981, | |
| "grad_norm": 0.23838654656179914, | |
| "kl": 0.00142669677734375, | |
| "learning_rate": 8.14760409853456e-07, | |
| "loss": -0.001, | |
| "reward": 1.4808967113494873, | |
| "reward_std": 0.532819926738739, | |
| "rewards/": 6.833054542541504, | |
| "rewards/math_compute_score": 0.1428571492433548, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1779.09521484375, | |
| "epoch": 0.30787589498806683, | |
| "grad_norm": 0.2415045050319174, | |
| "kl": 0.00141143798828125, | |
| "learning_rate": 8.117449009293668e-07, | |
| "loss": 0.0146, | |
| "reward": 1.3048317432403564, | |
| "reward_std": 0.6548949480056763, | |
| "rewards/": 6.524158477783203, | |
| "rewards/math_compute_score": 0.0, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.31026252983293556, | |
| "grad_norm": 0.22090091602582893, | |
| "learning_rate": 8.087107262799855e-07, | |
| "loss": -0.051, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.31026252983293556, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1763.0952758789062, | |
| "eval_kl": 0.001575469970703125, | |
| "eval_loss": 0.01967952772974968, | |
| "eval_reward": 1.662981390953064, | |
| "eval_reward_std": 0.5752230435609818, | |
| "eval_rewards/": 6.529192328453064, | |
| "eval_rewards/math_compute_score": 0.4464285746216774, | |
| "eval_runtime": 98.0654, | |
| "eval_samples_per_second": 0.214, | |
| "eval_steps_per_second": 0.01, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1817.3452758789062, | |
| "epoch": 0.3126491646778043, | |
| "grad_norm": 0.2602875367873934, | |
| "kl": 0.00157928466796875, | |
| "learning_rate": 8.056580675765129e-07, | |
| "loss": 0.0341, | |
| "reward": 1.412472128868103, | |
| "reward_std": 0.7119008004665375, | |
| "rewards/": 6.871884346008301, | |
| "rewards/math_compute_score": 0.04761905036866665, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1185.5238037109375, | |
| "epoch": 0.315035799522673, | |
| "grad_norm": 0.2738706058516209, | |
| "kl": 0.00148773193359375, | |
| "learning_rate": 8.025871075968826e-07, | |
| "loss": 0.0178, | |
| "reward": 1.7256139516830444, | |
| "reward_std": 0.424249529838562, | |
| "rewards/": 5.866164684295654, | |
| "rewards/math_compute_score": 0.6904761791229248, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1664.547607421875, | |
| "epoch": 0.31742243436754175, | |
| "grad_norm": 0.2672055217932093, | |
| "kl": 0.00159454345703125, | |
| "learning_rate": 7.994980302148169e-07, | |
| "loss": 0.0251, | |
| "reward": 1.4857888221740723, | |
| "reward_std": 0.5646493434906006, | |
| "rewards/": 7.0479912757873535, | |
| "rewards/math_compute_score": 0.095238097012043, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1582.1190185546875, | |
| "epoch": 0.3198090692124105, | |
| "grad_norm": 0.23074518957473036, | |
| "kl": 0.0013427734375, | |
| "learning_rate": 7.963910203888176e-07, | |
| "loss": 0.0051, | |
| "reward": 1.954390048980713, | |
| "reward_std": 0.568084180355072, | |
| "rewards/": 6.914806842803955, | |
| "rewards/math_compute_score": 0.7142857313156128, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1632.4285888671875, | |
| "epoch": 0.3221957040572792, | |
| "grad_norm": 0.2984688074510682, | |
| "kl": 0.002105712890625, | |
| "learning_rate": 7.932662641510914e-07, | |
| "loss": 0.0074, | |
| "reward": 1.4273810386657715, | |
| "reward_std": 0.2996509373188019, | |
| "rewards/": 6.470238208770752, | |
| "rewards/math_compute_score": 0.1666666716337204, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1608.90478515625, | |
| "epoch": 0.324582338902148, | |
| "grad_norm": 0.2744118083019373, | |
| "kl": 0.001983642578125, | |
| "learning_rate": 7.90123948596412e-07, | |
| "loss": 0.0955, | |
| "reward": 1.621465802192688, | |
| "reward_std": 0.7447255849838257, | |
| "rewards/": 6.583519458770752, | |
| "rewards/math_compute_score": 0.380952388048172, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1611.0238037109375, | |
| "epoch": 0.3269689737470167, | |
| "grad_norm": 0.2625294125709787, | |
| "kl": 0.001495361328125, | |
| "learning_rate": 7.86964261870916e-07, | |
| "loss": 0.0211, | |
| "reward": 1.5608538389205933, | |
| "reward_std": 0.679740309715271, | |
| "rewards/": 6.566174030303955, | |
| "rewards/math_compute_score": 0.3095238208770752, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1599.9285888671875, | |
| "epoch": 0.32935560859188545, | |
| "grad_norm": 0.3038466252946791, | |
| "kl": 0.00173187255859375, | |
| "learning_rate": 7.837873931608399e-07, | |
| "loss": 0.0589, | |
| "reward": 1.10877525806427, | |
| "reward_std": 0.8366554975509644, | |
| "rewards/": 5.44863748550415, | |
| "rewards/math_compute_score": 0.02380952425301075, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1703.666748046875, | |
| "epoch": 0.3317422434367542, | |
| "grad_norm": 0.25300249483455434, | |
| "kl": 0.0018157958984375, | |
| "learning_rate": 7.805935326811912e-07, | |
| "loss": 0.0349, | |
| "reward": 1.8174108266830444, | |
| "reward_std": 0.490249365568161, | |
| "rewards/": 6.610863208770752, | |
| "rewards/math_compute_score": 0.6190476417541504, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.3341288782816229, | |
| "grad_norm": 0.2417853507095786, | |
| "learning_rate": 7.773828716643592e-07, | |
| "loss": 0.018, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.3341288782816229, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1762.2500305175781, | |
| "eval_kl": 0.0018558502197265625, | |
| "eval_loss": 0.025447282940149307, | |
| "eval_reward": 1.6102934777736664, | |
| "eval_reward_std": 0.6778208911418915, | |
| "eval_rewards/": 6.456229090690613, | |
| "eval_rewards/math_compute_score": 0.3988095265813172, | |
| "eval_runtime": 91.0536, | |
| "eval_samples_per_second": 0.231, | |
| "eval_steps_per_second": 0.011, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1680.9880981445312, | |
| "epoch": 0.33651551312649164, | |
| "grad_norm": 0.27816646041180015, | |
| "kl": 0.001827239990234375, | |
| "learning_rate": 7.741556023486654e-07, | |
| "loss": 0.0594, | |
| "reward": 1.5625837445259094, | |
| "reward_std": 0.5163165330886841, | |
| "rewards/": 6.6224424839019775, | |
| "rewards/math_compute_score": 0.2976190522313118, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1580.6190185546875, | |
| "epoch": 0.33890214797136037, | |
| "grad_norm": 0.2833770658693713, | |
| "kl": 0.0019989013671875, | |
| "learning_rate": 7.709119179668537e-07, | |
| "loss": 0.0394, | |
| "reward": 1.7200149297714233, | |
| "reward_std": 0.4698960483074188, | |
| "rewards/": 6.885788917541504, | |
| "rewards/math_compute_score": 0.4285714328289032, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1628.1905517578125, | |
| "epoch": 0.3412887828162291, | |
| "grad_norm": 0.2536982306965292, | |
| "kl": 0.00177764892578125, | |
| "learning_rate": 7.676520127345196e-07, | |
| "loss": 0.0159, | |
| "reward": 1.4838913679122925, | |
| "reward_std": 0.6197465658187866, | |
| "rewards/": 6.562314033508301, | |
| "rewards/math_compute_score": 0.2142857164144516, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1552.3809814453125, | |
| "epoch": 0.3436754176610978, | |
| "grad_norm": 0.31347368028504985, | |
| "kl": 0.00180816650390625, | |
| "learning_rate": 7.643760818384819e-07, | |
| "loss": 0.087, | |
| "reward": 1.6655505895614624, | |
| "reward_std": 0.6516547799110413, | |
| "rewards/": 6.042038917541504, | |
| "rewards/math_compute_score": 0.5714285969734192, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1977.2857666015625, | |
| "epoch": 0.3460620525059666, | |
| "grad_norm": 0.25241631041768503, | |
| "kl": 0.00189208984375, | |
| "learning_rate": 7.610843214250964e-07, | |
| "loss": 0.0293, | |
| "reward": 1.0822917222976685, | |
| "reward_std": 0.6940091252326965, | |
| "rewards/": 5.982886791229248, | |
| "rewards/math_compute_score": -0.1428571492433548, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1490.0238037109375, | |
| "epoch": 0.34844868735083534, | |
| "grad_norm": 0.2982197191520602, | |
| "kl": 0.0022430419921875, | |
| "learning_rate": 7.577769285885108e-07, | |
| "loss": 0.0155, | |
| "reward": 1.8208333253860474, | |
| "reward_std": 0.4490146338939667, | |
| "rewards/": 6.913690567016602, | |
| "rewards/math_compute_score": 0.5476190447807312, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1673.857177734375, | |
| "epoch": 0.35083532219570407, | |
| "grad_norm": 0.278252727347962, | |
| "kl": 0.002197265625, | |
| "learning_rate": 7.544541013588644e-07, | |
| "loss": 0.0415, | |
| "reward": 1.8191593885421753, | |
| "reward_std": 0.6377332210540771, | |
| "rewards/": 7.2862725257873535, | |
| "rewards/math_compute_score": 0.4523809552192688, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1316.261962890625, | |
| "epoch": 0.3532219570405728, | |
| "grad_norm": 0.28120546569751265, | |
| "kl": 0.0024871826171875, | |
| "learning_rate": 7.511160386904305e-07, | |
| "loss": -0.0168, | |
| "reward": 2.0831844806671143, | |
| "reward_std": 0.187238872051239, | |
| "rewards/": 7.749256134033203, | |
| "rewards/math_compute_score": 0.6666666865348816, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1709.547607421875, | |
| "epoch": 0.3556085918854415, | |
| "grad_norm": 0.23539062329651342, | |
| "kl": 0.001708984375, | |
| "learning_rate": 7.477629404497047e-07, | |
| "loss": 0.0365, | |
| "reward": 1.254538655281067, | |
| "reward_std": 0.7538212537765503, | |
| "rewards/": 6.177455425262451, | |
| "rewards/math_compute_score": 0.02380952425301075, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.35799522673031026, | |
| "grad_norm": 0.24520940371651193, | |
| "learning_rate": 7.443950074034367e-07, | |
| "loss": 0.0149, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.35799522673031026, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1737.2857360839844, | |
| "eval_kl": 0.0021305084228515625, | |
| "eval_loss": 0.042245958000421524, | |
| "eval_reward": 1.562076896429062, | |
| "eval_reward_std": 0.6442549824714661, | |
| "eval_rewards/": 6.334193706512451, | |
| "eval_rewards/math_compute_score": 0.36904762499034405, | |
| "eval_runtime": 90.9442, | |
| "eval_samples_per_second": 0.231, | |
| "eval_steps_per_second": 0.011, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1729.0357055664062, | |
| "epoch": 0.360381861575179, | |
| "grad_norm": 0.26927693948058945, | |
| "kl": 0.0023193359375, | |
| "learning_rate": 7.41012441206611e-07, | |
| "loss": -0.0032, | |
| "reward": 1.4532668590545654, | |
| "reward_std": 0.5979789793491364, | |
| "rewards/": 6.504429578781128, | |
| "rewards/math_compute_score": 0.19047619495540857, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1618.3333740234375, | |
| "epoch": 0.3627684964200477, | |
| "grad_norm": 0.27430715469286404, | |
| "kl": 0.002349853515625, | |
| "learning_rate": 7.376154443903713e-07, | |
| "loss": 0.0893, | |
| "reward": 1.5656062364578247, | |
| "reward_std": 0.6696694493293762, | |
| "rewards/": 6.7804131507873535, | |
| "rewards/math_compute_score": 0.261904776096344, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1713.1429443359375, | |
| "epoch": 0.36515513126491644, | |
| "grad_norm": 0.24192656702948537, | |
| "kl": 0.00191497802734375, | |
| "learning_rate": 7.342042203498951e-07, | |
| "loss": 0.0093, | |
| "reward": 1.3910435438156128, | |
| "reward_std": 0.8083306550979614, | |
| "rewards/": 6.574265480041504, | |
| "rewards/math_compute_score": 0.095238097012043, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1752.547607421875, | |
| "epoch": 0.36754176610978523, | |
| "grad_norm": 0.23529364280462375, | |
| "kl": 0.00177764892578125, | |
| "learning_rate": 7.307789733322145e-07, | |
| "loss": 0.0412, | |
| "reward": 1.2998976707458496, | |
| "reward_std": 0.6855893135070801, | |
| "rewards/": 6.499488353729248, | |
| "rewards/math_compute_score": 0.0, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1709.09521484375, | |
| "epoch": 0.36992840095465396, | |
| "grad_norm": 0.2565888286348459, | |
| "kl": 0.0020294189453125, | |
| "learning_rate": 7.273399084239878e-07, | |
| "loss": 0.0343, | |
| "reward": 1.2922619581222534, | |
| "reward_std": 0.5724942684173584, | |
| "rewards/": 6.842262268066406, | |
| "rewards/math_compute_score": -0.095238097012043, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1770.0238037109375, | |
| "epoch": 0.3723150357995227, | |
| "grad_norm": 0.2556634342782622, | |
| "kl": 0.0023651123046875, | |
| "learning_rate": 7.238872315392189e-07, | |
| "loss": 0.0484, | |
| "reward": 1.8619048595428467, | |
| "reward_std": 0.4541710913181305, | |
| "rewards/": 7.214285850524902, | |
| "rewards/math_compute_score": 0.523809552192688, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1747.21435546875, | |
| "epoch": 0.3747016706443914, | |
| "grad_norm": 0.23922842509554956, | |
| "kl": 0.00201416015625, | |
| "learning_rate": 7.204211494069291e-07, | |
| "loss": -0.0377, | |
| "reward": 1.7935267686843872, | |
| "reward_std": 0.5942096710205078, | |
| "rewards/": 7.062871932983398, | |
| "rewards/math_compute_score": 0.4761904776096344, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1375.4285888671875, | |
| "epoch": 0.37708830548926014, | |
| "grad_norm": 0.2504129627401833, | |
| "kl": 0.002655029296875, | |
| "learning_rate": 7.16941869558779e-07, | |
| "loss": 0.0313, | |
| "reward": 2.2074406147003174, | |
| "reward_std": 0.2237553596496582, | |
| "rewards/": 7.418154716491699, | |
| "rewards/math_compute_score": 0.9047619104385376, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1834.4761962890625, | |
| "epoch": 0.3794749403341289, | |
| "grad_norm": 0.22770506606151802, | |
| "kl": 0.00201416015625, | |
| "learning_rate": 7.134496003166423e-07, | |
| "loss": 0.0172, | |
| "reward": 1.2007441520690918, | |
| "reward_std": 0.6796280741691589, | |
| "rewards/": 6.194196701049805, | |
| "rewards/math_compute_score": -0.0476190485060215, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.3818615751789976, | |
| "grad_norm": 0.27823803082361936, | |
| "learning_rate": 7.099445507801323e-07, | |
| "loss": 0.0394, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.3818615751789976, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1710.9940795898438, | |
| "eval_kl": 0.002544403076171875, | |
| "eval_loss": 0.03967365622520447, | |
| "eval_reward": 1.7245815098285675, | |
| "eval_reward_std": 0.6458448991179466, | |
| "eval_rewards/": 6.646717071533203, | |
| "eval_rewards/math_compute_score": 0.4940476305782795, | |
| "eval_runtime": 90.5014, | |
| "eval_samples_per_second": 0.232, | |
| "eval_steps_per_second": 0.011, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1524.2262573242188, | |
| "epoch": 0.38424821002386633, | |
| "grad_norm": 0.28852725034404547, | |
| "kl": 0.0025177001953125, | |
| "learning_rate": 7.064269308140829e-07, | |
| "loss": 0.0211, | |
| "reward": 1.7621653079986572, | |
| "reward_std": 0.4823741465806961, | |
| "rewards/": 7.048921346664429, | |
| "rewards/math_compute_score": 0.4404761865735054, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1638.952392578125, | |
| "epoch": 0.38663484486873506, | |
| "grad_norm": 0.25636209494281637, | |
| "kl": 0.002410888671875, | |
| "learning_rate": 7.02896951035982e-07, | |
| "loss": 0.044, | |
| "reward": 1.4277158975601196, | |
| "reward_std": 0.5193288922309875, | |
| "rewards/": 6.852864742279053, | |
| "rewards/math_compute_score": 0.0714285746216774, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1781.119140625, | |
| "epoch": 0.38902147971360385, | |
| "grad_norm": 0.217013334112183, | |
| "kl": 0.00238037109375, | |
| "learning_rate": 6.993548228033617e-07, | |
| "loss": 0.0355, | |
| "reward": 1.5361608266830444, | |
| "reward_std": 0.46594858169555664, | |
| "rewards/": 6.347470283508301, | |
| "rewards/math_compute_score": 0.3333333432674408, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1553.2381591796875, | |
| "epoch": 0.3914081145584726, | |
| "grad_norm": 0.24395443915530762, | |
| "kl": 0.00225830078125, | |
| "learning_rate": 6.958007582011424e-07, | |
| "loss": 0.0249, | |
| "reward": 2.022023916244507, | |
| "reward_std": 0.3801310360431671, | |
| "rewards/": 7.6339287757873535, | |
| "rewards/math_compute_score": 0.6190476417541504, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1617.8095703125, | |
| "epoch": 0.3937947494033413, | |
| "grad_norm": 0.24004403569382207, | |
| "kl": 0.0025787353515625, | |
| "learning_rate": 6.922349700289347e-07, | |
| "loss": 0.0256, | |
| "reward": 1.8187501430511475, | |
| "reward_std": 0.47890540957450867, | |
| "rewards/": 6.998512268066406, | |
| "rewards/math_compute_score": 0.523809552192688, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1533.261962890625, | |
| "epoch": 0.39618138424821003, | |
| "grad_norm": 0.23219253681724725, | |
| "kl": 0.002532958984375, | |
| "learning_rate": 6.886576717882981e-07, | |
| "loss": -0.0059, | |
| "reward": 1.9263392686843872, | |
| "reward_std": 0.4234418570995331, | |
| "rewards/": 6.8697919845581055, | |
| "rewards/math_compute_score": 0.6904761791229248, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1722.21435546875, | |
| "epoch": 0.39856801909307876, | |
| "grad_norm": 0.2695428246946182, | |
| "kl": 0.0027008056640625, | |
| "learning_rate": 6.850690776699573e-07, | |
| "loss": -0.002, | |
| "reward": 1.3836426734924316, | |
| "reward_std": 0.5190478563308716, | |
| "rewards/": 6.346784591674805, | |
| "rewards/math_compute_score": 0.1428571492433548, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1703.90478515625, | |
| "epoch": 0.4009546539379475, | |
| "grad_norm": 0.2480850316910639, | |
| "kl": 0.0023040771484375, | |
| "learning_rate": 6.814694025409773e-07, | |
| "loss": 0.0089, | |
| "reward": 1.8032739162445068, | |
| "reward_std": 0.4905838966369629, | |
| "rewards/": 7.111607074737549, | |
| "rewards/math_compute_score": 0.4761904776096344, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1487.3333740234375, | |
| "epoch": 0.4033412887828162, | |
| "grad_norm": 0.2441614041062561, | |
| "kl": 0.002593994140625, | |
| "learning_rate": 6.778588619318993e-07, | |
| "loss": 0.0052, | |
| "reward": 1.5892950296401978, | |
| "reward_std": 0.679408609867096, | |
| "rewards/": 5.8512372970581055, | |
| "rewards/math_compute_score": 0.523809552192688, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.40572792362768495, | |
| "grad_norm": 0.28274768118904486, | |
| "learning_rate": 6.742376720238346e-07, | |
| "loss": -0.0091, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.40572792362768495, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1709.7024230957031, | |
| "eval_kl": 0.0027008056640625, | |
| "eval_loss": 0.03433432802557945, | |
| "eval_reward": 1.67239710688591, | |
| "eval_reward_std": 0.5494325160980225, | |
| "eval_rewards/": 6.457223296165466, | |
| "eval_rewards/math_compute_score": 0.47619048599153757, | |
| "eval_runtime": 90.002, | |
| "eval_samples_per_second": 0.233, | |
| "eval_steps_per_second": 0.011, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1721.3690795898438, | |
| "epoch": 0.4081145584725537, | |
| "grad_norm": 0.2507835886818597, | |
| "kl": 0.00286865234375, | |
| "learning_rate": 6.706060496355211e-07, | |
| "loss": 0.0396, | |
| "reward": 1.640829622745514, | |
| "reward_std": 0.6370173096656799, | |
| "rewards/": 6.680338621139526, | |
| "rewards/math_compute_score": 0.380952388048172, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1680.547607421875, | |
| "epoch": 0.4105011933174224, | |
| "grad_norm": 0.24695563782362182, | |
| "kl": 0.0029449462890625, | |
| "learning_rate": 6.669642122103422e-07, | |
| "loss": 0.0106, | |
| "reward": 1.5374256372451782, | |
| "reward_std": 0.48627233505249023, | |
| "rewards/": 6.734746932983398, | |
| "rewards/math_compute_score": 0.2380952388048172, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1830.261962890625, | |
| "epoch": 0.4128878281622912, | |
| "grad_norm": 0.27668091559057234, | |
| "kl": 0.0030975341796875, | |
| "learning_rate": 6.633123778033061e-07, | |
| "loss": 0.0415, | |
| "reward": 1.9467262029647827, | |
| "reward_std": 0.5212621092796326, | |
| "rewards/": 6.876488208770752, | |
| "rewards/math_compute_score": 0.7142857313156128, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1621.71435546875, | |
| "epoch": 0.4152744630071599, | |
| "grad_norm": 0.27868551362271904, | |
| "kl": 0.0023345947265625, | |
| "learning_rate": 6.596507650679899e-07, | |
| "loss": 0.0289, | |
| "reward": 1.2761160135269165, | |
| "reward_std": 0.6406970620155334, | |
| "rewards/": 6.380580425262451, | |
| "rewards/math_compute_score": 0.0, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1595.8095703125, | |
| "epoch": 0.41766109785202865, | |
| "grad_norm": 0.2972079507840572, | |
| "kl": 0.0031585693359375, | |
| "learning_rate": 6.559795932434488e-07, | |
| "loss": 0.0709, | |
| "reward": 1.8078126907348633, | |
| "reward_std": 0.5548811554908752, | |
| "rewards/": 6.943824768066406, | |
| "rewards/math_compute_score": 0.523809552192688, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1457.666748046875, | |
| "epoch": 0.4200477326968974, | |
| "grad_norm": 0.2279280487796575, | |
| "kl": 0.0028076171875, | |
| "learning_rate": 6.52299082141088e-07, | |
| "loss": -0.0355, | |
| "reward": 2.040308952331543, | |
| "reward_std": 0.5329591631889343, | |
| "rewards/": 7.05868673324585, | |
| "rewards/math_compute_score": 0.785714328289032, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1632.261962890625, | |
| "epoch": 0.4224343675417661, | |
| "grad_norm": 0.24847601288466012, | |
| "kl": 0.0029449462890625, | |
| "learning_rate": 6.486094521315021e-07, | |
| "loss": 0.0605, | |
| "reward": 1.9953869581222534, | |
| "reward_std": 0.5443364381790161, | |
| "rewards/": 7.1197919845581055, | |
| "rewards/math_compute_score": 0.7142857313156128, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1861.4285888671875, | |
| "epoch": 0.42482100238663484, | |
| "grad_norm": 0.22918857863503886, | |
| "kl": 0.002655029296875, | |
| "learning_rate": 6.449109241312802e-07, | |
| "loss": 0.003, | |
| "reward": 1.405282735824585, | |
| "reward_std": 0.6072686314582825, | |
| "rewards/": 6.074032783508301, | |
| "rewards/math_compute_score": 0.2380952388048172, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1707.8809814453125, | |
| "epoch": 0.42720763723150357, | |
| "grad_norm": 0.2734274914799027, | |
| "kl": 0.002685546875, | |
| "learning_rate": 6.412037195897785e-07, | |
| "loss": 0.0409, | |
| "reward": 1.6891371011734009, | |
| "reward_std": 0.5659449100494385, | |
| "rewards/": 7.5885419845581055, | |
| "rewards/math_compute_score": 0.2142857164144516, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.4295942720763723, | |
| "grad_norm": 0.25298589420589623, | |
| "learning_rate": 6.374880604758614e-07, | |
| "loss": -0.0146, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.4295942720763723, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1732.7916870117188, | |
| "eval_kl": 0.002941131591796875, | |
| "eval_loss": 0.04335374757647514, | |
| "eval_reward": 1.724925696849823, | |
| "eval_reward_std": 0.5939712524414062, | |
| "eval_rewards/": 6.6722471714019775, | |
| "eval_rewards/math_compute_score": 0.4880952462553978, | |
| "eval_runtime": 90.8058, | |
| "eval_samples_per_second": 0.231, | |
| "eval_steps_per_second": 0.011, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1528.4166870117188, | |
| "epoch": 0.431980906921241, | |
| "grad_norm": 0.31782017197338924, | |
| "kl": 0.0032196044921875, | |
| "learning_rate": 6.337641692646106e-07, | |
| "loss": 0.0545, | |
| "reward": 1.7690011262893677, | |
| "reward_std": 0.47228382527828217, | |
| "rewards/": 7.130719900131226, | |
| "rewards/math_compute_score": 0.4285714477300644, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1502.8095703125, | |
| "epoch": 0.4343675417661098, | |
| "grad_norm": 0.26689710039208736, | |
| "kl": 0.003662109375, | |
| "learning_rate": 6.300322689240041e-07, | |
| "loss": 0.0446, | |
| "reward": 1.989508867263794, | |
| "reward_std": 0.41250666975975037, | |
| "rewards/": 6.518973350524902, | |
| "rewards/math_compute_score": 0.8571428656578064, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1575.452392578125, | |
| "epoch": 0.43675417661097854, | |
| "grad_norm": 0.27786153846351275, | |
| "kl": 0.003875732421875, | |
| "learning_rate": 6.262925829015675e-07, | |
| "loss": -0.0018, | |
| "reward": 1.7005953788757324, | |
| "reward_std": 0.36450114846229553, | |
| "rewards/": 7.264881134033203, | |
| "rewards/math_compute_score": 0.3095238208770752, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1625.8809814453125, | |
| "epoch": 0.43914081145584727, | |
| "grad_norm": 0.2596912738373031, | |
| "kl": 0.0035247802734375, | |
| "learning_rate": 6.225453351109934e-07, | |
| "loss": -0.0137, | |
| "reward": 1.7954614162445068, | |
| "reward_std": 0.3381480872631073, | |
| "rewards/": 7.263020992279053, | |
| "rewards/math_compute_score": 0.4285714328289032, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1478.6190185546875, | |
| "epoch": 0.441527446300716, | |
| "grad_norm": 0.286737248058935, | |
| "kl": 0.0033416748046875, | |
| "learning_rate": 6.187907499187356e-07, | |
| "loss": 0.0023, | |
| "reward": 1.675409197807312, | |
| "reward_std": 0.4261094629764557, | |
| "rewards/": 7.043713092803955, | |
| "rewards/math_compute_score": 0.3333333432674408, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1754.2857666015625, | |
| "epoch": 0.4439140811455847, | |
| "grad_norm": 0.31883404722867287, | |
| "kl": 0.003570556640625, | |
| "learning_rate": 6.150290521305745e-07, | |
| "loss": 0.0083, | |
| "reward": 1.1777018308639526, | |
| "reward_std": 0.5951432585716248, | |
| "rewards/": 6.55517578125, | |
| "rewards/math_compute_score": -0.1666666716337204, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1574.8333740234375, | |
| "epoch": 0.44630071599045346, | |
| "grad_norm": 0.2623240542860342, | |
| "kl": 0.0038604736328125, | |
| "learning_rate": 6.112604669781572e-07, | |
| "loss": 0.0151, | |
| "reward": 2.1099700927734375, | |
| "reward_std": 0.4681752920150757, | |
| "rewards/": 7.502232074737549, | |
| "rewards/math_compute_score": 0.761904776096344, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1458.71435546875, | |
| "epoch": 0.4486873508353222, | |
| "grad_norm": 0.2584305452560197, | |
| "kl": 0.0034027099609375, | |
| "learning_rate": 6.074852201055121e-07, | |
| "loss": 0.0251, | |
| "reward": 1.9345983266830444, | |
| "reward_std": 0.48896414041519165, | |
| "rewards/": 7.387277126312256, | |
| "rewards/math_compute_score": 0.5714285969734192, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1851.6905517578125, | |
| "epoch": 0.4510739856801909, | |
| "grad_norm": 0.23937901273037387, | |
| "kl": 0.0029449462890625, | |
| "learning_rate": 6.037035375555375e-07, | |
| "loss": 0.0495, | |
| "reward": 1.6072173118591309, | |
| "reward_std": 0.6452977657318115, | |
| "rewards/": 6.8932294845581055, | |
| "rewards/math_compute_score": 0.2857142984867096, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.45346062052505964, | |
| "grad_norm": 0.2790819447741767, | |
| "learning_rate": 5.999156457564685e-07, | |
| "loss": 0.0699, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.45346062052505964, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1715.4345703125, | |
| "eval_kl": 0.0034027099609375, | |
| "eval_loss": 0.029469896107912064, | |
| "eval_reward": 1.7646177113056183, | |
| "eval_reward_std": 0.5356989577412605, | |
| "eval_rewards/": 6.751659631729126, | |
| "eval_rewards/math_compute_score": 0.5178571455180645, | |
| "eval_runtime": 90.5105, | |
| "eval_samples_per_second": 0.232, | |
| "eval_steps_per_second": 0.011, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1566.2857666015625, | |
| "epoch": 0.45584725536992843, | |
| "grad_norm": 0.26673369685055265, | |
| "kl": 0.0030975341796875, | |
| "learning_rate": 5.961217715083184e-07, | |
| "loss": -0.0247, | |
| "reward": 1.439574122428894, | |
| "reward_std": 0.4712224751710892, | |
| "rewards/": 6.388346433639526, | |
| "rewards/math_compute_score": 0.20238095708191395, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1669.6905517578125, | |
| "epoch": 0.45823389021479716, | |
| "grad_norm": 0.288642662443768, | |
| "kl": 0.00396728515625, | |
| "learning_rate": 5.923221419693001e-07, | |
| "loss": 0.055, | |
| "reward": 1.3469215631484985, | |
| "reward_std": 0.6074704527854919, | |
| "rewards/": 6.3536553382873535, | |
| "rewards/math_compute_score": 0.095238097012043, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1724.9761962890625, | |
| "epoch": 0.4606205250596659, | |
| "grad_norm": 0.2442715266509315, | |
| "kl": 0.0036773681640625, | |
| "learning_rate": 5.885169846422241e-07, | |
| "loss": 0.0315, | |
| "reward": 1.85975182056427, | |
| "reward_std": 0.6531968712806702, | |
| "rewards/": 6.822567939758301, | |
| "rewards/math_compute_score": 0.6190476417541504, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1360.5238037109375, | |
| "epoch": 0.4630071599045346, | |
| "grad_norm": 0.24247369346072625, | |
| "kl": 0.0033416748046875, | |
| "learning_rate": 5.847065273608777e-07, | |
| "loss": -0.0394, | |
| "reward": 2.0778274536132812, | |
| "reward_std": 0.3715685307979584, | |
| "rewards/": 7.246279716491699, | |
| "rewards/math_compute_score": 0.785714328289032, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1426.1429443359375, | |
| "epoch": 0.46539379474940334, | |
| "grad_norm": 0.3036683766801178, | |
| "kl": 0.003997802734375, | |
| "learning_rate": 5.808909982763825e-07, | |
| "loss": 0.0353, | |
| "reward": 2.0206658840179443, | |
| "reward_std": 0.4580070972442627, | |
| "rewards/": 6.865234375, | |
| "rewards/math_compute_score": 0.8095238208770752, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1853.0, | |
| "epoch": 0.4677804295942721, | |
| "grad_norm": 0.24630009542019882, | |
| "kl": 0.0035552978515625, | |
| "learning_rate": 5.770706258435342e-07, | |
| "loss": 0.0068, | |
| "reward": 1.443489670753479, | |
| "reward_std": 0.6034876108169556, | |
| "rewards/": 6.836495876312256, | |
| "rewards/math_compute_score": 0.095238097012043, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1722.3809814453125, | |
| "epoch": 0.4701670644391408, | |
| "grad_norm": 0.2804061267617125, | |
| "kl": 0.0036468505859375, | |
| "learning_rate": 5.732456388071246e-07, | |
| "loss": 0.0911, | |
| "reward": 1.7890625, | |
| "reward_std": 0.8560119271278381, | |
| "rewards/": 7.231027126312256, | |
| "rewards/math_compute_score": 0.4285714328289032, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1708.0, | |
| "epoch": 0.47255369928400953, | |
| "grad_norm": 0.23422157987379022, | |
| "kl": 0.0026702880859375, | |
| "learning_rate": 5.694162661882443e-07, | |
| "loss": 0.0098, | |
| "reward": 1.418210506439209, | |
| "reward_std": 0.5666205286979675, | |
| "rewards/": 6.51962423324585, | |
| "rewards/math_compute_score": 0.1428571492433548, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1664.6905517578125, | |
| "epoch": 0.47494033412887826, | |
| "grad_norm": 0.27629085755921956, | |
| "kl": 0.0035400390625, | |
| "learning_rate": 5.655827372705711e-07, | |
| "loss": 0.0046, | |
| "reward": 1.649553656578064, | |
| "reward_std": 0.35802432894706726, | |
| "rewards/": 7.104910850524902, | |
| "rewards/math_compute_score": 0.2857142984867096, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.477326968973747, | |
| "grad_norm": 0.26492233859737124, | |
| "learning_rate": 5.617452815866409e-07, | |
| "loss": 0.0269, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.477326968973747, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1717.8690795898438, | |
| "eval_kl": 0.0035858154296875, | |
| "eval_loss": 0.018599843606352806, | |
| "eval_reward": 1.7809989750385284, | |
| "eval_reward_std": 0.5547928586602211, | |
| "eval_rewards/": 6.857375502586365, | |
| "eval_rewards/math_compute_score": 0.511904776096344, | |
| "eval_runtime": 89.9494, | |
| "eval_samples_per_second": 0.233, | |
| "eval_steps_per_second": 0.011, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1621.952392578125, | |
| "epoch": 0.4797136038186158, | |
| "grad_norm": 0.2851430586205352, | |
| "kl": 0.00341033935546875, | |
| "learning_rate": 5.579041289041045e-07, | |
| "loss": 0.041, | |
| "reward": 1.6838914155960083, | |
| "reward_std": 0.5416805893182755, | |
| "rewards/": 6.9908857345581055, | |
| "rewards/math_compute_score": 0.357142873108387, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1644.59521484375, | |
| "epoch": 0.4821002386634845, | |
| "grad_norm": 0.22556906762590703, | |
| "kl": 0.0029449462890625, | |
| "learning_rate": 5.540595092119708e-07, | |
| "loss": -0.0015, | |
| "reward": 1.4434523582458496, | |
| "reward_std": 0.36706992983818054, | |
| "rewards/": 6.836309432983398, | |
| "rewards/math_compute_score": 0.095238097012043, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1847.2857666015625, | |
| "epoch": 0.48448687350835323, | |
| "grad_norm": 0.2540668611221056, | |
| "kl": 0.0035858154296875, | |
| "learning_rate": 5.502116527068362e-07, | |
| "loss": 0.0057, | |
| "reward": 1.016341209411621, | |
| "reward_std": 0.497652530670166, | |
| "rewards/": 6.605515480041504, | |
| "rewards/math_compute_score": -0.380952388048172, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1572.71435546875, | |
| "epoch": 0.48687350835322196, | |
| "grad_norm": 0.26513430034135327, | |
| "kl": 0.004119873046875, | |
| "learning_rate": 5.463607897791005e-07, | |
| "loss": 0.0501, | |
| "reward": 1.8174108266830444, | |
| "reward_std": 0.49505147337913513, | |
| "rewards/": 6.991815567016602, | |
| "rewards/math_compute_score": 0.523809552192688, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1550.0, | |
| "epoch": 0.4892601431980907, | |
| "grad_norm": 0.2697202949991492, | |
| "kl": 0.003753662109375, | |
| "learning_rate": 5.425071509991736e-07, | |
| "loss": 0.0289, | |
| "reward": 1.7891370058059692, | |
| "reward_std": 0.45509421825408936, | |
| "rewards/": 6.850446701049805, | |
| "rewards/math_compute_score": 0.523809552192688, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1605.357177734375, | |
| "epoch": 0.4916467780429594, | |
| "grad_norm": 0.26407614115571215, | |
| "kl": 0.003936767578125, | |
| "learning_rate": 5.386509671036695e-07, | |
| "loss": 0.0412, | |
| "reward": 1.5840773582458496, | |
| "reward_std": 0.44521623849868774, | |
| "rewards/": 7.253720283508301, | |
| "rewards/math_compute_score": 0.1666666716337204, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1551.5714111328125, | |
| "epoch": 0.49403341288782815, | |
| "grad_norm": 0.24883737154825586, | |
| "kl": 0.0035247802734375, | |
| "learning_rate": 5.347924689815906e-07, | |
| "loss": -0.023, | |
| "reward": 1.7059524059295654, | |
| "reward_std": 0.4412461817264557, | |
| "rewards/": 6.910714626312256, | |
| "rewards/math_compute_score": 0.4047619104385376, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1707.59521484375, | |
| "epoch": 0.4964200477326969, | |
| "grad_norm": 0.26824875622279615, | |
| "kl": 0.003326416015625, | |
| "learning_rate": 5.309318876605042e-07, | |
| "loss": 0.0433, | |
| "reward": 1.2729166746139526, | |
| "reward_std": 0.6343668103218079, | |
| "rewards/": 6.745535850524902, | |
| "rewards/math_compute_score": -0.095238097012043, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1499.952392578125, | |
| "epoch": 0.4988066825775656, | |
| "grad_norm": 0.2552057559316855, | |
| "kl": 0.0040283203125, | |
| "learning_rate": 5.270694542927088e-07, | |
| "loss": 0.0605, | |
| "reward": 1.950334906578064, | |
| "reward_std": 0.6608874797821045, | |
| "rewards/": 7.085007667541504, | |
| "rewards/math_compute_score": 0.6666666865348816, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.5011933174224343, | |
| "grad_norm": 0.26825451466871253, | |
| "learning_rate": 5.232054001413941e-07, | |
| "loss": 0.0836, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.5011933174224343, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1666.1667175292969, | |
| "eval_kl": 0.00366973876953125, | |
| "eval_loss": 0.032919324934482574, | |
| "eval_reward": 1.8185571432113647, | |
| "eval_reward_std": 0.553740456700325, | |
| "eval_rewards/": 6.854689955711365, | |
| "eval_rewards/math_compute_score": 0.5595238246023655, | |
| "eval_runtime": 89.3052, | |
| "eval_samples_per_second": 0.235, | |
| "eval_steps_per_second": 0.011, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1667.202392578125, | |
| "epoch": 0.5035799522673031, | |
| "grad_norm": 0.2303140614245728, | |
| "kl": 0.0036773681640625, | |
| "learning_rate": 5.193399565667944e-07, | |
| "loss": -0.0107, | |
| "reward": 1.8566593527793884, | |
| "reward_std": 0.5822675228118896, | |
| "rewards/": 6.902343988418579, | |
| "rewards/math_compute_score": 0.5952381044626236, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1385.21435546875, | |
| "epoch": 0.5059665871121718, | |
| "grad_norm": 0.26853565694319703, | |
| "kl": 0.004119873046875, | |
| "learning_rate": 5.154733550123355e-07, | |
| "loss": 0.0248, | |
| "reward": 1.8717262744903564, | |
| "reward_std": 0.28672024607658386, | |
| "rewards/": 6.882440567016602, | |
| "rewards/math_compute_score": 0.6190476417541504, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1742.666748046875, | |
| "epoch": 0.5083532219570406, | |
| "grad_norm": 0.30033605387329526, | |
| "kl": 0.004180908203125, | |
| "learning_rate": 5.116058269907778e-07, | |
| "loss": 0.0691, | |
| "reward": 1.3494606018066406, | |
| "reward_std": 0.6003190875053406, | |
| "rewards/": 6.556826591491699, | |
| "rewards/math_compute_score": 0.0476190485060215, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1593.666748046875, | |
| "epoch": 0.5107398568019093, | |
| "grad_norm": 0.24942776683135082, | |
| "kl": 0.00311279296875, | |
| "learning_rate": 5.077376040703532e-07, | |
| "loss": 0.0176, | |
| "reward": 1.759412169456482, | |
| "reward_std": 0.6285237073898315, | |
| "rewards/": 6.7018232345581055, | |
| "rewards/math_compute_score": 0.523809552192688, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1642.8809814453125, | |
| "epoch": 0.513126491646778, | |
| "grad_norm": 0.24962515536899138, | |
| "kl": 0.0032196044921875, | |
| "learning_rate": 5.038689178609011e-07, | |
| "loss": 0.0138, | |
| "reward": 1.337165117263794, | |
| "reward_std": 0.5744600892066956, | |
| "rewards/": 6.400111675262451, | |
| "rewards/math_compute_score": 0.0714285746216774, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1622.40478515625, | |
| "epoch": 0.5155131264916468, | |
| "grad_norm": 0.30908520647310384, | |
| "kl": 0.00433349609375, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0464, | |
| "reward": 1.8487166166305542, | |
| "reward_std": 0.5867566466331482, | |
| "rewards/": 6.576916217803955, | |
| "rewards/math_compute_score": 0.6666666865348816, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1577.09521484375, | |
| "epoch": 0.5178997613365155, | |
| "grad_norm": 0.32056320412809136, | |
| "kl": 0.005523681640625, | |
| "learning_rate": 4.961310821390989e-07, | |
| "loss": 0.0627, | |
| "reward": 1.755134105682373, | |
| "reward_std": 0.5534998774528503, | |
| "rewards/": 6.966145992279053, | |
| "rewards/math_compute_score": 0.4523809552192688, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1711.7381591796875, | |
| "epoch": 0.5202863961813843, | |
| "grad_norm": 0.2765241956924031, | |
| "kl": 0.003692626953125, | |
| "learning_rate": 4.922623959296468e-07, | |
| "loss": 0.0325, | |
| "reward": 1.5642856359481812, | |
| "reward_std": 0.6228559017181396, | |
| "rewards/": 7.059524059295654, | |
| "rewards/math_compute_score": 0.190476194024086, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1475.5714111328125, | |
| "epoch": 0.522673031026253, | |
| "grad_norm": 0.3255635099084251, | |
| "kl": 0.004364013671875, | |
| "learning_rate": 4.883941730092221e-07, | |
| "loss": 0.0906, | |
| "reward": 2.1429688930511475, | |
| "reward_std": 0.4786287546157837, | |
| "rewards/": 7.476748466491699, | |
| "rewards/math_compute_score": 0.8095238208770752, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.5250596658711217, | |
| "grad_norm": 0.2590935031754961, | |
| "learning_rate": 4.845266449876645e-07, | |
| "loss": -0.0051, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.5250596658711217, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1727.5059814453125, | |
| "eval_kl": 0.004108428955078125, | |
| "eval_loss": 0.06444509327411652, | |
| "eval_reward": 1.802697241306305, | |
| "eval_reward_std": 0.5698123574256897, | |
| "eval_rewards/": 7.108723998069763, | |
| "eval_rewards/math_compute_score": 0.47619048319756985, | |
| "eval_runtime": 90.4516, | |
| "eval_samples_per_second": 0.232, | |
| "eval_steps_per_second": 0.011, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1693.4761962890625, | |
| "epoch": 0.5274463007159904, | |
| "grad_norm": 0.24423068905597062, | |
| "kl": 0.00394439697265625, | |
| "learning_rate": 4.806600434332056e-07, | |
| "loss": 0.014, | |
| "reward": 1.3321336507797241, | |
| "reward_std": 0.5291797071695328, | |
| "rewards/": 6.660667896270752, | |
| "rewards/math_compute_score": 0.0, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1652.5238037109375, | |
| "epoch": 0.5298329355608592, | |
| "grad_norm": 0.27797504178630167, | |
| "kl": 0.004608154296875, | |
| "learning_rate": 4.76794599858606e-07, | |
| "loss": 0.0612, | |
| "reward": 1.6118676662445068, | |
| "reward_std": 0.7166314721107483, | |
| "rewards/": 6.726004600524902, | |
| "rewards/math_compute_score": 0.3333333432674408, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1610.0238037109375, | |
| "epoch": 0.5322195704057279, | |
| "grad_norm": 0.24481283197859993, | |
| "kl": 0.0047607421875, | |
| "learning_rate": 4.7293054570729126e-07, | |
| "loss": 0.0891, | |
| "reward": 1.7377232313156128, | |
| "reward_std": 0.3753490149974823, | |
| "rewards/": 6.402902126312256, | |
| "rewards/math_compute_score": 0.5714285969734192, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1788.6905517578125, | |
| "epoch": 0.5346062052505967, | |
| "grad_norm": 0.25068052940129437, | |
| "kl": 0.00421142578125, | |
| "learning_rate": 4.690681123394958e-07, | |
| "loss": -0.0178, | |
| "reward": 1.6801340579986572, | |
| "reward_std": 0.6131526231765747, | |
| "rewards/": 7.448288917541504, | |
| "rewards/math_compute_score": 0.2380952388048172, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1668.2857666015625, | |
| "epoch": 0.5369928400954654, | |
| "grad_norm": 0.24959786295483227, | |
| "kl": 0.004180908203125, | |
| "learning_rate": 4.6520753101840937e-07, | |
| "loss": 0.0296, | |
| "reward": 1.7870535850524902, | |
| "reward_std": 0.4355296194553375, | |
| "rewards/": 6.935267925262451, | |
| "rewards/math_compute_score": 0.5, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1850.119140625, | |
| "epoch": 0.5393794749403341, | |
| "grad_norm": 0.2773015045687595, | |
| "kl": 0.00469970703125, | |
| "learning_rate": 4.6134903289633066e-07, | |
| "loss": 0.0505, | |
| "reward": 1.5636905431747437, | |
| "reward_std": 0.6416950821876526, | |
| "rewards/": 7.342262268066406, | |
| "rewards/math_compute_score": 0.1190476194024086, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1882.1905517578125, | |
| "epoch": 0.5417661097852029, | |
| "grad_norm": 0.2532317353993275, | |
| "kl": 0.0042724609375, | |
| "learning_rate": 4.574928490008264e-07, | |
| "loss": 0.0448, | |
| "reward": 1.1148065328598022, | |
| "reward_std": 0.6571987867355347, | |
| "rewards/": 6.526413917541504, | |
| "rewards/math_compute_score": -0.2380952388048172, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1814.761962890625, | |
| "epoch": 0.5441527446300716, | |
| "grad_norm": 0.22033332515712287, | |
| "kl": 0.0037384033203125, | |
| "learning_rate": 4.536392102208997e-07, | |
| "loss": 0.0088, | |
| "reward": 1.4956845045089722, | |
| "reward_std": 0.4954971373081207, | |
| "rewards/": 7.097470283508301, | |
| "rewards/math_compute_score": 0.095238097012043, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1382.0, | |
| "epoch": 0.5465393794749404, | |
| "grad_norm": 0.26858072308905145, | |
| "kl": 0.005218505859375, | |
| "learning_rate": 4.4978834729316376e-07, | |
| "loss": 0.0353, | |
| "reward": 1.7789063453674316, | |
| "reward_std": 0.5954192876815796, | |
| "rewards/": 6.227864742279053, | |
| "rewards/math_compute_score": 0.6666666865348816, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.548926014319809, | |
| "grad_norm": 0.3402534060613979, | |
| "learning_rate": 4.459404907880292e-07, | |
| "loss": 0.0897, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.548926014319809, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1686.4048156738281, | |
| "eval_kl": 0.00449371337890625, | |
| "eval_loss": 0.023935753852128983, | |
| "eval_reward": 1.7440082132816315, | |
| "eval_reward_std": 0.5679794251918793, | |
| "eval_rewards/": 6.767659664154053, | |
| "eval_rewards/math_compute_score": 0.48809525929391384, | |
| "eval_runtime": 89.8107, | |
| "eval_samples_per_second": 0.234, | |
| "eval_steps_per_second": 0.011, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1321.3928833007812, | |
| "epoch": 0.5513126491646778, | |
| "grad_norm": 0.242406498971476, | |
| "kl": 0.0046234130859375, | |
| "learning_rate": 4.420958710958956e-07, | |
| "loss": -0.0013, | |
| "reward": 1.9190662503242493, | |
| "reward_std": 0.3563975691795349, | |
| "rewards/": 6.73818826675415, | |
| "rewards/math_compute_score": 0.7142857313156128, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1509.857177734375, | |
| "epoch": 0.5536992840095465, | |
| "grad_norm": 0.27883626832213565, | |
| "kl": 0.0036468505859375, | |
| "learning_rate": 4.3825471841335924e-07, | |
| "loss": 0.0371, | |
| "reward": 1.6071429252624512, | |
| "reward_std": 0.4766731262207031, | |
| "rewards/": 6.226190567016602, | |
| "rewards/math_compute_score": 0.4523809552192688, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1675.666748046875, | |
| "epoch": 0.5560859188544153, | |
| "grad_norm": 0.31965378501684055, | |
| "kl": 0.00439453125, | |
| "learning_rate": 4.3441726272942884e-07, | |
| "loss": 0.0565, | |
| "reward": 1.8153274059295654, | |
| "reward_std": 0.6682077050209045, | |
| "rewards/": 6.981399059295654, | |
| "rewards/math_compute_score": 0.523809552192688, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1410.6905517578125, | |
| "epoch": 0.5584725536992841, | |
| "grad_norm": 0.28553266771721203, | |
| "kl": 0.005126953125, | |
| "learning_rate": 4.305837338117557e-07, | |
| "loss": 0.0325, | |
| "reward": 1.7528274059295654, | |
| "reward_std": 0.553020715713501, | |
| "rewards/": 6.192708492279053, | |
| "rewards/math_compute_score": 0.6428571343421936, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1491.6905517578125, | |
| "epoch": 0.5608591885441527, | |
| "grad_norm": 0.28723777171974385, | |
| "kl": 0.004852294921875, | |
| "learning_rate": 4.267543611928754e-07, | |
| "loss": 0.0305, | |
| "reward": 2.1510417461395264, | |
| "reward_std": 0.48044469952583313, | |
| "rewards/": 7.326637268066406, | |
| "rewards/math_compute_score": 0.8571428656578064, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1592.452392578125, | |
| "epoch": 0.5632458233890215, | |
| "grad_norm": 0.25981926019228574, | |
| "kl": 0.00439453125, | |
| "learning_rate": 4.229293741564657e-07, | |
| "loss": 0.0434, | |
| "reward": 1.5803943872451782, | |
| "reward_std": 0.7352063655853271, | |
| "rewards/": 6.378162384033203, | |
| "rewards/math_compute_score": 0.380952388048172, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1855.166748046875, | |
| "epoch": 0.5656324582338902, | |
| "grad_norm": 0.24285139595600752, | |
| "kl": 0.005462646484375, | |
| "learning_rate": 4.1910900172361763e-07, | |
| "loss": 0.0093, | |
| "reward": 1.3181548118591309, | |
| "reward_std": 0.49610888957977295, | |
| "rewards/": 6.686011791229248, | |
| "rewards/math_compute_score": -0.02380952425301075, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1805.5714111328125, | |
| "epoch": 0.568019093078759, | |
| "grad_norm": 0.2538583793375119, | |
| "kl": 0.00537109375, | |
| "learning_rate": 4.1529347263912226e-07, | |
| "loss": 0.0095, | |
| "reward": 1.5097098350524902, | |
| "reward_std": 0.4408361315727234, | |
| "rewards/": 7.167596817016602, | |
| "rewards/math_compute_score": 0.095238097012043, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1417.1429443359375, | |
| "epoch": 0.5704057279236276, | |
| "grad_norm": 0.25434318216206253, | |
| "kl": 0.004608154296875, | |
| "learning_rate": 4.1148301535777587e-07, | |
| "loss": 0.0364, | |
| "reward": 2.183779716491699, | |
| "reward_std": 0.37863248586654663, | |
| "rewards/": 7.6808037757873535, | |
| "rewards/math_compute_score": 0.8095238208770752, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.5727923627684964, | |
| "grad_norm": 0.34238218294249706, | |
| "learning_rate": 4.076778580306999e-07, | |
| "loss": -0.0382, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.5727923627684964, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1660.4345703125, | |
| "eval_kl": 0.00495147705078125, | |
| "eval_loss": 0.05178676173090935, | |
| "eval_reward": 1.8340391218662262, | |
| "eval_reward_std": 0.5649153962731361, | |
| "eval_rewards/": 6.813052415847778, | |
| "eval_rewards/math_compute_score": 0.5892857238650322, | |
| "eval_runtime": 89.4974, | |
| "eval_samples_per_second": 0.235, | |
| "eval_steps_per_second": 0.011, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1437.3095397949219, | |
| "epoch": 0.5751789976133651, | |
| "grad_norm": 0.23354633615170214, | |
| "kl": 0.0046844482421875, | |
| "learning_rate": 4.038782284916816e-07, | |
| "loss": 0.0166, | |
| "reward": 1.68543541431427, | |
| "reward_std": 0.4059063643217087, | |
| "rewards/": 6.998605012893677, | |
| "rewards/math_compute_score": 0.3571428656578064, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1925.2857666015625, | |
| "epoch": 0.5775656324582339, | |
| "grad_norm": 0.2565081546038113, | |
| "kl": 0.004547119140625, | |
| "learning_rate": 4.000843542435315e-07, | |
| "loss": 0.0438, | |
| "reward": 1.1633185148239136, | |
| "reward_std": 0.8174499273300171, | |
| "rewards/": 6.578496932983398, | |
| "rewards/math_compute_score": -0.190476194024086, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1422.4285888671875, | |
| "epoch": 0.5799522673031027, | |
| "grad_norm": 0.29336911783349, | |
| "kl": 0.005523681640625, | |
| "learning_rate": 3.962964624444625e-07, | |
| "loss": -0.0476, | |
| "reward": 1.511476993560791, | |
| "reward_std": 0.5816119909286499, | |
| "rewards/": 6.319289684295654, | |
| "rewards/math_compute_score": 0.3095238208770752, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1830.71435546875, | |
| "epoch": 0.5823389021479713, | |
| "grad_norm": 0.2567772027773301, | |
| "kl": 0.004638671875, | |
| "learning_rate": 3.9251477989448795e-07, | |
| "loss": 0.0276, | |
| "reward": 1.3412946462631226, | |
| "reward_std": 0.5368376970291138, | |
| "rewards/": 6.801711559295654, | |
| "rewards/math_compute_score": -0.02380952425301075, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1410.90478515625, | |
| "epoch": 0.5847255369928401, | |
| "grad_norm": 0.28779373915724527, | |
| "kl": 0.005950927734375, | |
| "learning_rate": 3.8873953302184283e-07, | |
| "loss": 0.0955, | |
| "reward": 2.0691964626312256, | |
| "reward_std": 0.4866236746311188, | |
| "rewards/": 7.107887268066406, | |
| "rewards/math_compute_score": 0.8095238208770752, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1704.1905517578125, | |
| "epoch": 0.5871121718377088, | |
| "grad_norm": 0.22883743400607778, | |
| "kl": 0.004180908203125, | |
| "learning_rate": 3.849709478694255e-07, | |
| "loss": -0.0272, | |
| "reward": 1.5712053775787354, | |
| "reward_std": 0.5555210709571838, | |
| "rewards/": 6.522693634033203, | |
| "rewards/math_compute_score": 0.3333333432674408, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1329.452392578125, | |
| "epoch": 0.5894988066825776, | |
| "grad_norm": 0.287861136892914, | |
| "kl": 0.00537109375, | |
| "learning_rate": 3.8120925008126454e-07, | |
| "loss": -0.0196, | |
| "reward": 1.8973217010498047, | |
| "reward_std": 0.2906087338924408, | |
| "rewards/": 7.296131134033203, | |
| "rewards/math_compute_score": 0.5476190447807312, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1574.7381591796875, | |
| "epoch": 0.5918854415274463, | |
| "grad_norm": 0.2670606131074928, | |
| "kl": 0.005279541015625, | |
| "learning_rate": 3.7745466488900657e-07, | |
| "loss": 0.0051, | |
| "reward": 1.6572545766830444, | |
| "reward_std": 0.5282899141311646, | |
| "rewards/": 6.762463092803955, | |
| "rewards/math_compute_score": 0.380952388048172, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1842.8333740234375, | |
| "epoch": 0.594272076372315, | |
| "grad_norm": 0.2512682576888719, | |
| "kl": 0.004180908203125, | |
| "learning_rate": 3.7370741709843256e-07, | |
| "loss": 0.0202, | |
| "reward": 1.1577380895614624, | |
| "reward_std": 0.4659985601902008, | |
| "rewards/": 6.741071701049805, | |
| "rewards/math_compute_score": -0.2380952388048172, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.5966587112171837, | |
| "grad_norm": 0.2857471624345884, | |
| "learning_rate": 3.69967731075996e-07, | |
| "loss": 0.0114, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5966587112171837, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1692.4345397949219, | |
| "eval_kl": 0.00485992431640625, | |
| "eval_loss": 0.017853517085313797, | |
| "eval_reward": 1.7054944038391113, | |
| "eval_reward_std": 0.5782012939453125, | |
| "eval_rewards/": 6.575090765953064, | |
| "eval_rewards/math_compute_score": 0.48809524066746235, | |
| "eval_runtime": 89.5769, | |
| "eval_samples_per_second": 0.234, | |
| "eval_steps_per_second": 0.011, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1610.3928833007812, | |
| "epoch": 0.5990453460620525, | |
| "grad_norm": 0.270009406471592, | |
| "kl": 0.004669189453125, | |
| "learning_rate": 3.6623583073538965e-07, | |
| "loss": 0.034, | |
| "reward": 1.5946521162986755, | |
| "reward_std": 0.5078227818012238, | |
| "rewards/": 6.449451208114624, | |
| "rewards/math_compute_score": 0.3809523954987526, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1729.0, | |
| "epoch": 0.6014319809069213, | |
| "grad_norm": 0.22329451936667, | |
| "kl": 0.0040283203125, | |
| "learning_rate": 3.625119395241386e-07, | |
| "loss": -0.0209, | |
| "reward": 1.681398868560791, | |
| "reward_std": 0.542913019657135, | |
| "rewards/": 7.168899059295654, | |
| "rewards/math_compute_score": 0.3095238208770752, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1432.4285888671875, | |
| "epoch": 0.60381861575179, | |
| "grad_norm": 0.2991176866926496, | |
| "kl": 0.00604248046875, | |
| "learning_rate": 3.5879628041022135e-07, | |
| "loss": 0.09, | |
| "reward": 2.0321431159973145, | |
| "reward_std": 0.5620549917221069, | |
| "rewards/": 7.684524059295654, | |
| "rewards/math_compute_score": 0.6190476417541504, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1317.0714111328125, | |
| "epoch": 0.6062052505966588, | |
| "grad_norm": 0.22409602196187103, | |
| "kl": 0.00433349609375, | |
| "learning_rate": 3.550890758687198e-07, | |
| "loss": -0.0005, | |
| "reward": 1.9829614162445068, | |
| "reward_std": 0.2656542956829071, | |
| "rewards/": 6.771949768066406, | |
| "rewards/math_compute_score": 0.785714328289032, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1662.4761962890625, | |
| "epoch": 0.6085918854415274, | |
| "grad_norm": 0.2603499504911004, | |
| "kl": 0.005218505859375, | |
| "learning_rate": 3.513905478684978e-07, | |
| "loss": 0.0429, | |
| "reward": 1.5799851417541504, | |
| "reward_std": 0.5886630415916443, | |
| "rewards/": 6.566592216491699, | |
| "rewards/math_compute_score": 0.3333333432674408, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1513.1429443359375, | |
| "epoch": 0.6109785202863962, | |
| "grad_norm": 0.2523855873281953, | |
| "kl": 0.005035400390625, | |
| "learning_rate": 3.47700917858912e-07, | |
| "loss": 0.0282, | |
| "reward": 1.9692708253860474, | |
| "reward_std": 0.30754101276397705, | |
| "rewards/": 7.370163917541504, | |
| "rewards/math_compute_score": 0.6190476417541504, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1599.357177734375, | |
| "epoch": 0.6133651551312649, | |
| "grad_norm": 0.2641504670239811, | |
| "kl": 0.005340576171875, | |
| "learning_rate": 3.440204067565511e-07, | |
| "loss": 0.0331, | |
| "reward": 1.9764137268066406, | |
| "reward_std": 0.4379138946533203, | |
| "rewards/": 7.024925708770752, | |
| "rewards/math_compute_score": 0.7142857313156128, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1846.5714111328125, | |
| "epoch": 0.6157517899761337, | |
| "grad_norm": 0.23594897144723076, | |
| "kl": 0.00494384765625, | |
| "learning_rate": 3.4034923493201007e-07, | |
| "loss": 0.0141, | |
| "reward": 1.4104912281036377, | |
| "reward_std": 0.7667174935340881, | |
| "rewards/": 6.385788917541504, | |
| "rewards/math_compute_score": 0.1666666716337204, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1564.8809814453125, | |
| "epoch": 0.6181384248210023, | |
| "grad_norm": 0.27947145598838724, | |
| "kl": 0.0057373046875, | |
| "learning_rate": 3.366876221966939e-07, | |
| "loss": 0.0397, | |
| "reward": 1.8209078311920166, | |
| "reward_std": 0.4953598976135254, | |
| "rewards/": 7.009300708770752, | |
| "rewards/math_compute_score": 0.523809552192688, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.6205250596658711, | |
| "grad_norm": 0.24582056613485512, | |
| "learning_rate": 3.330357877896577e-07, | |
| "loss": -0.0136, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.6205250596658711, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1679.5833435058594, | |
| "eval_kl": 0.005035400390625, | |
| "eval_loss": 0.019856100901961327, | |
| "eval_reward": 1.8213519155979156, | |
| "eval_reward_std": 0.5068388804793358, | |
| "eval_rewards/": 6.773425936698914, | |
| "eval_rewards/math_compute_score": 0.5833333469927311, | |
| "eval_runtime": 88.9934, | |
| "eval_samples_per_second": 0.236, | |
| "eval_steps_per_second": 0.011, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1454.7738647460938, | |
| "epoch": 0.6229116945107399, | |
| "grad_norm": 0.29179311342786673, | |
| "kl": 0.0057525634765625, | |
| "learning_rate": 3.2939395036447875e-07, | |
| "loss": 0.0543, | |
| "reward": 1.9630953073501587, | |
| "reward_std": 0.4546659290790558, | |
| "rewards/": 7.339285850524902, | |
| "rewards/math_compute_score": 0.6190476268529892, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1676.71435546875, | |
| "epoch": 0.6252983293556086, | |
| "grad_norm": 0.26950110968223, | |
| "kl": 0.00616455078125, | |
| "learning_rate": 3.2576232797616555e-07, | |
| "loss": 0.0363, | |
| "reward": 1.8032739162445068, | |
| "reward_std": 0.540711522102356, | |
| "rewards/": 6.6354169845581055, | |
| "rewards/math_compute_score": 0.5952380895614624, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1341.952392578125, | |
| "epoch": 0.6276849642004774, | |
| "grad_norm": 0.29281930142683044, | |
| "kl": 0.005645751953125, | |
| "learning_rate": 3.221411380681007e-07, | |
| "loss": 0.0499, | |
| "reward": 1.8809523582458496, | |
| "reward_std": 0.4906759262084961, | |
| "rewards/": 6.738095283508301, | |
| "rewards/math_compute_score": 0.6666666865348816, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1802.3095703125, | |
| "epoch": 0.630071599045346, | |
| "grad_norm": 0.23358354567385772, | |
| "kl": 0.004974365234375, | |
| "learning_rate": 3.1853059745902285e-07, | |
| "loss": 0.0414, | |
| "reward": 1.7011163234710693, | |
| "reward_std": 0.6680436134338379, | |
| "rewards/": 7.172246932983398, | |
| "rewards/math_compute_score": 0.3333333432674408, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1440.1905517578125, | |
| "epoch": 0.6324582338902148, | |
| "grad_norm": 0.22255436445316382, | |
| "kl": 0.004638671875, | |
| "learning_rate": 3.1493092233004277e-07, | |
| "loss": -0.0207, | |
| "reward": 2.206026792526245, | |
| "reward_std": 0.30084043741226196, | |
| "rewards/": 7.22061014175415, | |
| "rewards/math_compute_score": 0.9523809552192688, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1661.6429443359375, | |
| "epoch": 0.6348448687350835, | |
| "grad_norm": 0.24901623510825996, | |
| "kl": 0.005950927734375, | |
| "learning_rate": 3.1134232821170197e-07, | |
| "loss": 0.0692, | |
| "reward": 1.8057291507720947, | |
| "reward_std": 0.6432158946990967, | |
| "rewards/": 6.933407783508301, | |
| "rewards/math_compute_score": 0.523809552192688, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1834.8333740234375, | |
| "epoch": 0.6372315035799523, | |
| "grad_norm": 0.2693858842162648, | |
| "kl": 0.00518798828125, | |
| "learning_rate": 3.0776502997106523e-07, | |
| "loss": 0.0186, | |
| "reward": 1.266369104385376, | |
| "reward_std": 0.5596219897270203, | |
| "rewards/": 7.09375, | |
| "rewards/math_compute_score": -0.190476194024086, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1539.6190185546875, | |
| "epoch": 0.639618138424821, | |
| "grad_norm": 0.33229466794859597, | |
| "kl": 0.0054931640625, | |
| "learning_rate": 3.0419924179885767e-07, | |
| "loss": 0.0968, | |
| "reward": 1.6295758485794067, | |
| "reward_std": 0.6508839130401611, | |
| "rewards/": 7.0050225257873535, | |
| "rewards/math_compute_score": 0.2857142984867096, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1653.7857666015625, | |
| "epoch": 0.6420047732696897, | |
| "grad_norm": 0.2518475987768706, | |
| "kl": 0.0059814453125, | |
| "learning_rate": 3.006451771966383e-07, | |
| "loss": 0.0212, | |
| "reward": 1.811290979385376, | |
| "reward_std": 0.7287031412124634, | |
| "rewards/": 7.627883434295654, | |
| "rewards/math_compute_score": 0.3571428656578064, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.6443914081145584, | |
| "grad_norm": 0.26996644513187057, | |
| "learning_rate": 2.97103048964018e-07, | |
| "loss": 0.0185, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.6443914081145584, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1707.8691101074219, | |
| "eval_kl": 0.00543212890625, | |
| "eval_loss": 0.0052840313874185085, | |
| "eval_reward": 1.76309534907341, | |
| "eval_reward_std": 0.5642824694514275, | |
| "eval_rewards/": 6.74404776096344, | |
| "eval_rewards/math_compute_score": 0.5178571529686451, | |
| "eval_runtime": 104.2716, | |
| "eval_samples_per_second": 0.201, | |
| "eval_steps_per_second": 0.01, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1718.3095703125, | |
| "epoch": 0.6467780429594272, | |
| "grad_norm": 0.2669365684764653, | |
| "kl": 0.00506591796875, | |
| "learning_rate": 2.935730691859172e-07, | |
| "loss": 0.0291, | |
| "reward": 1.2227492928504944, | |
| "reward_std": 0.5440776199102402, | |
| "rewards/": 6.256603479385376, | |
| "rewards/math_compute_score": -0.0357142873108387, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1737.40478515625, | |
| "epoch": 0.649164677804296, | |
| "grad_norm": 0.27901330162610016, | |
| "kl": 0.00506591796875, | |
| "learning_rate": 2.900554492198677e-07, | |
| "loss": 0.064, | |
| "reward": 1.4799107313156128, | |
| "reward_std": 0.568335235118866, | |
| "rewards/": 7.018601417541504, | |
| "rewards/math_compute_score": 0.095238097012043, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1645.59521484375, | |
| "epoch": 0.6515513126491647, | |
| "grad_norm": 0.3047482282945327, | |
| "kl": 0.00592041015625, | |
| "learning_rate": 2.865503996833577e-07, | |
| "loss": 0.0249, | |
| "reward": 1.6728236675262451, | |
| "reward_std": 0.7209111452102661, | |
| "rewards/": 6.840309143066406, | |
| "rewards/math_compute_score": 0.380952388048172, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1540.6190185546875, | |
| "epoch": 0.6539379474940334, | |
| "grad_norm": 0.28896859955485643, | |
| "kl": 0.005584716796875, | |
| "learning_rate": 2.8305813044122093e-07, | |
| "loss": -0.0273, | |
| "reward": 1.7266185283660889, | |
| "reward_std": 0.47260189056396484, | |
| "rewards/": 6.633091449737549, | |
| "rewards/math_compute_score": 0.5, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1646.6190185546875, | |
| "epoch": 0.6563245823389021, | |
| "grad_norm": 0.29507920195415793, | |
| "kl": 0.006500244140625, | |
| "learning_rate": 2.7957885059307095e-07, | |
| "loss": 0.0727, | |
| "reward": 1.7315198183059692, | |
| "reward_std": 0.729472815990448, | |
| "rewards/": 6.562360763549805, | |
| "rewards/math_compute_score": 0.523809552192688, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1560.9761962890625, | |
| "epoch": 0.6587112171837709, | |
| "grad_norm": 0.259247847289947, | |
| "kl": 0.0059814453125, | |
| "learning_rate": 2.761127684607811e-07, | |
| "loss": 0.0022, | |
| "reward": 1.774553656578064, | |
| "reward_std": 0.343158096075058, | |
| "rewards/": 7.348958492279053, | |
| "rewards/math_compute_score": 0.380952388048172, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1427.8333740234375, | |
| "epoch": 0.6610978520286396, | |
| "grad_norm": 0.33776370170761744, | |
| "kl": 0.00616455078125, | |
| "learning_rate": 2.7266009157601223e-07, | |
| "loss": 0.0646, | |
| "reward": 1.768303632736206, | |
| "reward_std": 0.5567624568939209, | |
| "rewards/": 6.936756134033203, | |
| "rewards/math_compute_score": 0.4761904776096344, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1616.3333740234375, | |
| "epoch": 0.6634844868735084, | |
| "grad_norm": 0.24395736200501855, | |
| "kl": 0.005279541015625, | |
| "learning_rate": 2.6922102666778546e-07, | |
| "loss": 0.0763, | |
| "reward": 1.7422620058059692, | |
| "reward_std": 0.58295738697052, | |
| "rewards/": 7.282738208770752, | |
| "rewards/math_compute_score": 0.3571428656578064, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1870.666748046875, | |
| "epoch": 0.665871121718377, | |
| "grad_norm": 0.2711602572939092, | |
| "kl": 0.005645751953125, | |
| "learning_rate": 2.65795779650105e-07, | |
| "loss": 0.0126, | |
| "reward": 1.3323661088943481, | |
| "reward_std": 0.6261304616928101, | |
| "rewards/": 6.852306842803955, | |
| "rewards/math_compute_score": -0.0476190485060215, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.6682577565632458, | |
| "grad_norm": 0.2538680866763002, | |
| "learning_rate": 2.623845556096288e-07, | |
| "loss": 0.0018, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.6682577565632458, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1675.9226684570312, | |
| "eval_kl": 0.0055694580078125, | |
| "eval_loss": 0.021293407306075096, | |
| "eval_reward": 1.7486564218997955, | |
| "eval_reward_std": 0.4653979241847992, | |
| "eval_rewards/": 6.719472408294678, | |
| "eval_rewards/math_compute_score": 0.5059523964300752, | |
| "eval_runtime": 89.391, | |
| "eval_samples_per_second": 0.235, | |
| "eval_steps_per_second": 0.011, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1709.9761962890625, | |
| "epoch": 0.6706443914081146, | |
| "grad_norm": 0.3114946337877643, | |
| "kl": 0.0066375732421875, | |
| "learning_rate": 2.589875587933892e-07, | |
| "loss": 0.0322, | |
| "reward": 1.6709263324737549, | |
| "reward_std": 0.6808830201625824, | |
| "rewards/": 7.164155721664429, | |
| "rewards/math_compute_score": 0.2976190522313118, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1118.0, | |
| "epoch": 0.6730310262529833, | |
| "grad_norm": 0.28538793372981025, | |
| "kl": 0.0059814453125, | |
| "learning_rate": 2.5560499259656323e-07, | |
| "loss": 0.0625, | |
| "reward": 2.193861961364746, | |
| "reward_std": 0.3232349455356598, | |
| "rewards/": 6.969308376312256, | |
| "rewards/math_compute_score": 1.0, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1450.7381591796875, | |
| "epoch": 0.6754176610978521, | |
| "grad_norm": 0.2721947819426561, | |
| "kl": 0.006103515625, | |
| "learning_rate": 2.522370595502954e-07, | |
| "loss": 0.0198, | |
| "reward": 1.9322172403335571, | |
| "reward_std": 0.42703866958618164, | |
| "rewards/": 6.803943634033203, | |
| "rewards/math_compute_score": 0.7142857313156128, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1785.21435546875, | |
| "epoch": 0.6778042959427207, | |
| "grad_norm": 0.2394368126047437, | |
| "kl": 0.006134033203125, | |
| "learning_rate": 2.4888396130956943e-07, | |
| "loss": 0.0365, | |
| "reward": 1.4974703788757324, | |
| "reward_std": 0.7022044658660889, | |
| "rewards/": 6.91592264175415, | |
| "rewards/math_compute_score": 0.1428571492433548, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1556.40478515625, | |
| "epoch": 0.6801909307875895, | |
| "grad_norm": 0.28935547227648567, | |
| "kl": 0.005462646484375, | |
| "learning_rate": 2.455458986411356e-07, | |
| "loss": 0.0616, | |
| "reward": 1.8514881134033203, | |
| "reward_std": 0.4598250687122345, | |
| "rewards/": 6.971726417541504, | |
| "rewards/math_compute_score": 0.5714285969734192, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1276.2857666015625, | |
| "epoch": 0.6825775656324582, | |
| "grad_norm": 0.30697866631475274, | |
| "kl": 0.00701904296875, | |
| "learning_rate": 2.4222307141148906e-07, | |
| "loss": -0.0098, | |
| "reward": 1.4965217113494873, | |
| "reward_std": 0.5237927436828613, | |
| "rewards/": 6.054036617279053, | |
| "rewards/math_compute_score": 0.3571428656578064, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1645.90478515625, | |
| "epoch": 0.684964200477327, | |
| "grad_norm": 0.2560352818812391, | |
| "kl": 0.004364013671875, | |
| "learning_rate": 2.3891567857490367e-07, | |
| "loss": 0.0384, | |
| "reward": 1.592038631439209, | |
| "reward_std": 0.7362250685691833, | |
| "rewards/": 5.864955425262451, | |
| "rewards/math_compute_score": 0.523809552192688, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1456.261962890625, | |
| "epoch": 0.6873508353221957, | |
| "grad_norm": 0.2542182384142676, | |
| "kl": 0.005615234375, | |
| "learning_rate": 2.3562391816151805e-07, | |
| "loss": 0.0389, | |
| "reward": 1.9461311101913452, | |
| "reward_std": 0.38350486755371094, | |
| "rewards/": 7.444940567016602, | |
| "rewards/math_compute_score": 0.5714285969734192, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1732.4761962890625, | |
| "epoch": 0.6897374701670644, | |
| "grad_norm": 0.2957765896159597, | |
| "kl": 0.0059814453125, | |
| "learning_rate": 2.3234798726548044e-07, | |
| "loss": 0.0439, | |
| "reward": 1.5767114162445068, | |
| "reward_std": 0.6308966279029846, | |
| "rewards/": 6.8359375, | |
| "rewards/math_compute_score": 0.261904776096344, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.6921241050119332, | |
| "grad_norm": 0.24595123987891096, | |
| "learning_rate": 2.2908808203314633e-07, | |
| "loss": 0.0576, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.6921241050119332, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1689.4702758789062, | |
| "eval_kl": 0.00591278076171875, | |
| "eval_loss": 0.024828355759382248, | |
| "eval_reward": 1.7160040140151978, | |
| "eval_reward_std": 0.5452041104435921, | |
| "eval_rewards/": 6.651448607444763, | |
| "eval_rewards/math_compute_score": 0.48214287124574184, | |
| "eval_runtime": 89.5372, | |
| "eval_samples_per_second": 0.235, | |
| "eval_steps_per_second": 0.011, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1495.4642944335938, | |
| "epoch": 0.6945107398568019, | |
| "grad_norm": 0.30325016746870037, | |
| "kl": 0.005462646484375, | |
| "learning_rate": 2.258443976513345e-07, | |
| "loss": 0.0171, | |
| "reward": 1.9273810386657715, | |
| "reward_std": 0.39634837210178375, | |
| "rewards/": 7.208333492279053, | |
| "rewards/math_compute_score": 0.6071428656578064, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1870.71435546875, | |
| "epoch": 0.6968973747016707, | |
| "grad_norm": 0.24295700768116915, | |
| "kl": 0.0052490234375, | |
| "learning_rate": 2.2261712833564088e-07, | |
| "loss": 0.045, | |
| "reward": 1.1304640769958496, | |
| "reward_std": 0.6737284064292908, | |
| "rewards/": 6.3189873695373535, | |
| "rewards/math_compute_score": -0.1666666716337204, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1458.2381591796875, | |
| "epoch": 0.6992840095465394, | |
| "grad_norm": 0.2770843906605435, | |
| "kl": 0.006378173828125, | |
| "learning_rate": 2.1940646731880885e-07, | |
| "loss": 0.006, | |
| "reward": 1.776116132736206, | |
| "reward_std": 0.45735234022140503, | |
| "rewards/": 6.975818634033203, | |
| "rewards/math_compute_score": 0.4761904776096344, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1907.047607421875, | |
| "epoch": 0.7016706443914081, | |
| "grad_norm": 0.27407827255457246, | |
| "kl": 0.0057373046875, | |
| "learning_rate": 2.1621260683916005e-07, | |
| "loss": 0.0246, | |
| "reward": 1.1069941520690918, | |
| "reward_std": 0.706874430179596, | |
| "rewards/": 6.487351417541504, | |
| "rewards/math_compute_score": -0.2380952388048172, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1427.8809814453125, | |
| "epoch": 0.7040572792362768, | |
| "grad_norm": 0.2531898073661023, | |
| "kl": 0.0052490234375, | |
| "learning_rate": 2.1303573812908383e-07, | |
| "loss": 0.0217, | |
| "reward": 1.809449315071106, | |
| "reward_std": 0.3774469792842865, | |
| "rewards/": 7.332961559295654, | |
| "rewards/math_compute_score": 0.4285714328289032, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1773.59521484375, | |
| "epoch": 0.7064439140811456, | |
| "grad_norm": 0.2651426374809382, | |
| "kl": 0.005706787109375, | |
| "learning_rate": 2.0987605140358822e-07, | |
| "loss": -0.0014, | |
| "reward": 1.1260230541229248, | |
| "reward_std": 0.657254159450531, | |
| "rewards/": 5.915829658508301, | |
| "rewards/math_compute_score": -0.0714285746216774, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1461.40478515625, | |
| "epoch": 0.7088305489260143, | |
| "grad_norm": 0.25548482546819784, | |
| "kl": 0.00592041015625, | |
| "learning_rate": 2.0673373584890846e-07, | |
| "loss": 0.0298, | |
| "reward": 2.0536458492279053, | |
| "reward_std": 0.39229947328567505, | |
| "rewards/": 7.030134201049805, | |
| "rewards/math_compute_score": 0.8095238208770752, | |
| "step": 297 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1537.547607421875, | |
| "epoch": 0.711217183770883, | |
| "grad_norm": 0.28766204010964963, | |
| "kl": 0.00634765625, | |
| "learning_rate": 2.0360897961118246e-07, | |
| "loss": 0.0335, | |
| "reward": 1.798586368560791, | |
| "reward_std": 0.5106884241104126, | |
| "rewards/": 7.183407783508301, | |
| "rewards/math_compute_score": 0.4523809552192688, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1209.4285888671875, | |
| "epoch": 0.7136038186157518, | |
| "grad_norm": 0.2739533568578717, | |
| "kl": 0.006439208984375, | |
| "learning_rate": 2.0050196978518318e-07, | |
| "loss": 0.0196, | |
| "reward": 2.1623237133026123, | |
| "reward_std": 0.330209881067276, | |
| "rewards/": 7.002094745635986, | |
| "rewards/math_compute_score": 0.9523809552192688, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.7159904534606205, | |
| "grad_norm": 0.28060465982015764, | |
| "learning_rate": 1.9741289240311754e-07, | |
| "loss": 0.0362, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7159904534606205, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1636.3095397949219, | |
| "eval_kl": 0.0059051513671875, | |
| "eval_loss": 0.04656996577978134, | |
| "eval_reward": 1.842539221048355, | |
| "eval_reward_std": 0.5257963240146637, | |
| "eval_rewards/": 6.712695956230164, | |
| "eval_rewards/math_compute_score": 0.6250000149011612, | |
| "eval_runtime": 88.3841, | |
| "eval_samples_per_second": 0.238, | |
| "eval_steps_per_second": 0.011, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1566.7381591796875, | |
| "epoch": 0.7183770883054893, | |
| "grad_norm": 0.2791601521311669, | |
| "kl": 0.00567626953125, | |
| "learning_rate": 1.9434193242348706e-07, | |
| "loss": 0.0155, | |
| "reward": 1.7760975360870361, | |
| "reward_std": 0.5548774749040604, | |
| "rewards/": 7.070963621139526, | |
| "rewards/math_compute_score": 0.45238097012043, | |
| "step": 301 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1478.6190185546875, | |
| "epoch": 0.720763723150358, | |
| "grad_norm": 0.2767593330017176, | |
| "kl": 0.005950927734375, | |
| "learning_rate": 1.9128927372001453e-07, | |
| "loss": -0.0091, | |
| "reward": 2.0397322177886963, | |
| "reward_std": 0.43883827328681946, | |
| "rewards/": 7.436756134033203, | |
| "rewards/math_compute_score": 0.6904761791229248, | |
| "step": 302 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1737.261962890625, | |
| "epoch": 0.7231503579952268, | |
| "grad_norm": 0.277582250530735, | |
| "kl": 0.0068359375, | |
| "learning_rate": 1.8825509907063326e-07, | |
| "loss": 0.0513, | |
| "reward": 1.3215030431747437, | |
| "reward_std": 0.6728135347366333, | |
| "rewards/": 6.2265625, | |
| "rewards/math_compute_score": 0.095238097012043, | |
| "step": 303 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1751.5238037109375, | |
| "epoch": 0.7255369928400954, | |
| "grad_norm": 0.2462323539587146, | |
| "kl": 0.005096435546875, | |
| "learning_rate": 1.8523959014654406e-07, | |
| "loss": 0.0345, | |
| "reward": 1.5617932081222534, | |
| "reward_std": 0.7247648239135742, | |
| "rewards/": 6.8565850257873535, | |
| "rewards/math_compute_score": 0.2380952388048172, | |
| "step": 304 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1656.761962890625, | |
| "epoch": 0.7279236276849642, | |
| "grad_norm": 0.2560096521866889, | |
| "kl": 0.006744384765625, | |
| "learning_rate": 1.822429275013374e-07, | |
| "loss": -0.0002, | |
| "reward": 1.9063987731933594, | |
| "reward_std": 0.5774410367012024, | |
| "rewards/": 7.1510419845581055, | |
| "rewards/math_compute_score": 0.5952380895614624, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1543.761962890625, | |
| "epoch": 0.7303102625298329, | |
| "grad_norm": 0.25382406268426466, | |
| "kl": 0.00506591796875, | |
| "learning_rate": 1.7926529056018297e-07, | |
| "loss": 0.0603, | |
| "reward": 1.8665552139282227, | |
| "reward_std": 0.36209362745285034, | |
| "rewards/": 7.047060966491699, | |
| "rewards/math_compute_score": 0.5714285969734192, | |
| "step": 306 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1739.8333740234375, | |
| "epoch": 0.7326968973747017, | |
| "grad_norm": 0.2836473746570401, | |
| "kl": 0.006866455078125, | |
| "learning_rate": 1.763068576090862e-07, | |
| "loss": 0.0395, | |
| "reward": 1.5139509439468384, | |
| "reward_std": 0.6646405458450317, | |
| "rewards/": 6.712611675262451, | |
| "rewards/math_compute_score": 0.2142857164144516, | |
| "step": 307 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1637.8809814453125, | |
| "epoch": 0.7350835322195705, | |
| "grad_norm": 0.2569474942305258, | |
| "kl": 0.006317138671875, | |
| "learning_rate": 1.7336780578421418e-07, | |
| "loss": 0.0301, | |
| "reward": 1.4991816282272339, | |
| "reward_std": 0.4310193359851837, | |
| "rewards/": 6.9244794845581055, | |
| "rewards/math_compute_score": 0.1428571492433548, | |
| "step": 308 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1488.5, | |
| "epoch": 0.7374701670644391, | |
| "grad_norm": 0.3024850487650047, | |
| "kl": 0.0068359375, | |
| "learning_rate": 1.7044831106128864e-07, | |
| "loss": -0.0418, | |
| "reward": 1.8633928298950195, | |
| "reward_std": 0.3402189016342163, | |
| "rewards/": 7.03125, | |
| "rewards/math_compute_score": 0.5714285969734192, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.7398568019093079, | |
| "grad_norm": 0.28358189812459517, | |
| "learning_rate": 1.6754854824504988e-07, | |
| "loss": 0.0584, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.7398568019093079, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1617.6310119628906, | |
| "eval_kl": 0.00592803955078125, | |
| "eval_loss": 0.036280252039432526, | |
| "eval_reward": 1.829003930091858, | |
| "eval_reward_std": 0.5586381033062935, | |
| "eval_rewards/": 6.859305262565613, | |
| "eval_rewards/math_compute_score": 0.5714285746216774, | |
| "eval_runtime": 87.8182, | |
| "eval_samples_per_second": 0.239, | |
| "eval_steps_per_second": 0.011, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1745.5119018554688, | |
| "epoch": 0.7422434367541766, | |
| "grad_norm": 0.23985256079431821, | |
| "kl": 0.006500244140625, | |
| "learning_rate": 1.6466869095879076e-07, | |
| "loss": 0.0432, | |
| "reward": 1.5979260206222534, | |
| "reward_std": 0.6946238726377487, | |
| "rewards/": 6.942010879516602, | |
| "rewards/math_compute_score": 0.261904776096344, | |
| "step": 311 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1562.452392578125, | |
| "epoch": 0.7446300715990454, | |
| "grad_norm": 0.301641383304508, | |
| "kl": 0.00604248046875, | |
| "learning_rate": 1.618089116339601e-07, | |
| "loss": 0.074, | |
| "reward": 1.9055060148239136, | |
| "reward_std": 0.48013433814048767, | |
| "rewards/": 7.527529716491699, | |
| "rewards/math_compute_score": 0.5, | |
| "step": 312 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1563.0238037109375, | |
| "epoch": 0.747016706443914, | |
| "grad_norm": 0.2821395913132352, | |
| "kl": 0.005889892578125, | |
| "learning_rate": 1.5896938149983907e-07, | |
| "loss": 0.0421, | |
| "reward": 1.8350447416305542, | |
| "reward_std": 0.41543343663215637, | |
| "rewards/": 6.889509201049805, | |
| "rewards/math_compute_score": 0.5714285969734192, | |
| "step": 313 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1611.166748046875, | |
| "epoch": 0.7494033412887828, | |
| "grad_norm": 0.28629589377089504, | |
| "kl": 0.006561279296875, | |
| "learning_rate": 1.561502705732883e-07, | |
| "loss": 0.0339, | |
| "reward": 1.900520920753479, | |
| "reward_std": 0.5674861669540405, | |
| "rewards/": 7.788318634033203, | |
| "rewards/math_compute_score": 0.4285714328289032, | |
| "step": 314 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1294.2857666015625, | |
| "epoch": 0.7517899761336515, | |
| "grad_norm": 0.29565718515676304, | |
| "kl": 0.006195068359375, | |
| "learning_rate": 1.5335174764856907e-07, | |
| "loss": 0.0469, | |
| "reward": 2.021242380142212, | |
| "reward_std": 0.4866698682308197, | |
| "rewards/": 7.05859375, | |
| "rewards/math_compute_score": 0.761904776096344, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1552.3333740234375, | |
| "epoch": 0.7541766109785203, | |
| "grad_norm": 0.2699783744211519, | |
| "kl": 0.00732421875, | |
| "learning_rate": 1.505739802872351e-07, | |
| "loss": -0.0036, | |
| "reward": 1.7238560914993286, | |
| "reward_std": 0.5615432262420654, | |
| "rewards/": 6.809756278991699, | |
| "rewards/math_compute_score": 0.4523809552192688, | |
| "step": 316 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1412.3095703125, | |
| "epoch": 0.7565632458233891, | |
| "grad_norm": 0.25544319193529125, | |
| "kl": 0.005828857421875, | |
| "learning_rate": 1.4781713480810182e-07, | |
| "loss": 0.0326, | |
| "reward": 2.0182292461395264, | |
| "reward_std": 0.21248388290405273, | |
| "rewards/": 7.4244794845581055, | |
| "rewards/math_compute_score": 0.6666666865348816, | |
| "step": 317 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1879.5238037109375, | |
| "epoch": 0.7589498806682577, | |
| "grad_norm": 0.2858571715960386, | |
| "kl": 0.00732421875, | |
| "learning_rate": 1.4508137627728628e-07, | |
| "loss": 0.0278, | |
| "reward": 1.18154776096344, | |
| "reward_std": 0.6407575607299805, | |
| "rewards/": 6.574404716491699, | |
| "rewards/math_compute_score": -0.1666666716337204, | |
| "step": 318 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1535.09521484375, | |
| "epoch": 0.7613365155131265, | |
| "grad_norm": 0.2434413985432686, | |
| "kl": 0.00579833984375, | |
| "learning_rate": 1.4236686849832496e-07, | |
| "loss": 0.0309, | |
| "reward": 1.8485863208770752, | |
| "reward_std": 0.5873944759368896, | |
| "rewards/": 7.147693634033203, | |
| "rewards/math_compute_score": 0.523809552192688, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.7637231503579952, | |
| "grad_norm": 0.2696031278018177, | |
| "learning_rate": 1.3967377400236514e-07, | |
| "loss": 0.0202, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.7637231503579952, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1639.8214721679688, | |
| "eval_kl": 0.00627899169921875, | |
| "eval_loss": 0.04983534291386604, | |
| "eval_reward": 1.8444220423698425, | |
| "eval_reward_std": 0.505550891160965, | |
| "eval_rewards/": 6.960205316543579, | |
| "eval_rewards/math_compute_score": 0.5654762051999569, | |
| "eval_runtime": 88.8251, | |
| "eval_samples_per_second": 0.236, | |
| "eval_steps_per_second": 0.011, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1541.7142944335938, | |
| "epoch": 0.766109785202864, | |
| "grad_norm": 0.23918547245411584, | |
| "kl": 0.0065765380859375, | |
| "learning_rate": 1.370022540384347e-07, | |
| "loss": 0.0411, | |
| "reward": 1.8474704027175903, | |
| "reward_std": 0.53986856341362, | |
| "rewards/": 7.046875, | |
| "rewards/math_compute_score": 0.5476190522313118, | |
| "step": 321 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1663.0, | |
| "epoch": 0.7684964200477327, | |
| "grad_norm": 0.26591156051162185, | |
| "kl": 0.00579833984375, | |
| "learning_rate": 1.3435246856378525e-07, | |
| "loss": -0.0037, | |
| "reward": 1.948484182357788, | |
| "reward_std": 0.6001127362251282, | |
| "rewards/": 7.17099142074585, | |
| "rewards/math_compute_score": 0.6428571343421936, | |
| "step": 322 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1639.1905517578125, | |
| "epoch": 0.7708830548926014, | |
| "grad_norm": 0.27667895800229375, | |
| "kl": 0.00653076171875, | |
| "learning_rate": 1.3172457623431705e-07, | |
| "loss": 0.0247, | |
| "reward": 1.505738377571106, | |
| "reward_std": 0.7357364892959595, | |
| "rewards/": 6.766787528991699, | |
| "rewards/math_compute_score": 0.190476194024086, | |
| "step": 323 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1452.0, | |
| "epoch": 0.7732696897374701, | |
| "grad_norm": 0.2607746555497128, | |
| "kl": 0.005889892578125, | |
| "learning_rate": 1.2911873439507765e-07, | |
| "loss": 0.0434, | |
| "reward": 1.8965216875076294, | |
| "reward_std": 0.5311101675033569, | |
| "rewards/": 6.33975076675415, | |
| "rewards/math_compute_score": 0.785714328289032, | |
| "step": 324 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1695.857177734375, | |
| "epoch": 0.7756563245823389, | |
| "grad_norm": 0.2459311422515735, | |
| "kl": 0.005828857421875, | |
| "learning_rate": 1.265350990708417e-07, | |
| "loss": 0.0237, | |
| "reward": 1.4831101894378662, | |
| "reward_std": 0.5591450333595276, | |
| "rewards/": 6.653645992279053, | |
| "rewards/math_compute_score": 0.190476194024086, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1460.8333740234375, | |
| "epoch": 0.7780429594272077, | |
| "grad_norm": 0.27149132930004455, | |
| "kl": 0.006072998046875, | |
| "learning_rate": 1.2397382495676873e-07, | |
| "loss": 0.0584, | |
| "reward": 1.9678572416305542, | |
| "reward_std": 0.5532270073890686, | |
| "rewards/": 7.077381134033203, | |
| "rewards/math_compute_score": 0.6904761791229248, | |
| "step": 326 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1638.9285888671875, | |
| "epoch": 0.7804295942720764, | |
| "grad_norm": 0.25969805069075325, | |
| "kl": 0.00555419921875, | |
| "learning_rate": 1.214350654091413e-07, | |
| "loss": 0.0538, | |
| "reward": 1.7508186101913452, | |
| "reward_std": 0.6165412068367004, | |
| "rewards/": 6.944568634033203, | |
| "rewards/math_compute_score": 0.4523809552192688, | |
| "step": 327 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1443.452392578125, | |
| "epoch": 0.7828162291169452, | |
| "grad_norm": 0.24646262050218948, | |
| "kl": 0.006866455078125, | |
| "learning_rate": 1.1891897243618183e-07, | |
| "loss": -0.0158, | |
| "reward": 2.0401785373687744, | |
| "reward_std": 0.37470337748527527, | |
| "rewards/": 7.248512268066406, | |
| "rewards/math_compute_score": 0.738095223903656, | |
| "step": 328 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1417.7857666015625, | |
| "epoch": 0.7852028639618138, | |
| "grad_norm": 0.3456800505744401, | |
| "kl": 0.0074462890625, | |
| "learning_rate": 1.1642569668895169e-07, | |
| "loss": 0.1143, | |
| "reward": 1.848995566368103, | |
| "reward_std": 0.655685305595398, | |
| "rewards/": 7.244977951049805, | |
| "rewards/math_compute_score": 0.5, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.7875894988066826, | |
| "grad_norm": 0.2836166409784225, | |
| "learning_rate": 1.139553874523313e-07, | |
| "loss": 0.0474, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.7875894988066826, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1675.7380981445312, | |
| "eval_kl": 0.0064849853515625, | |
| "eval_loss": 0.037556298077106476, | |
| "eval_reward": 1.8106631338596344, | |
| "eval_reward_std": 0.45607033371925354, | |
| "eval_rewards/": 6.7676016092300415, | |
| "eval_rewards/math_compute_score": 0.5714285839349031, | |
| "eval_runtime": 89.131, | |
| "eval_samples_per_second": 0.236, | |
| "eval_steps_per_second": 0.011, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1613.4286499023438, | |
| "epoch": 0.7899761336515513, | |
| "grad_norm": 0.27172662798968367, | |
| "kl": 0.006988525390625, | |
| "learning_rate": 1.1150819263608097e-07, | |
| "loss": -0.0121, | |
| "reward": 1.7162761092185974, | |
| "reward_std": 0.4672919511795044, | |
| "rewards/": 7.676618576049805, | |
| "rewards/math_compute_score": 0.226190485060215, | |
| "step": 331 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1452.0238037109375, | |
| "epoch": 0.7923627684964201, | |
| "grad_norm": 0.29870503341388466, | |
| "kl": 0.007415771484375, | |
| "learning_rate": 1.090842587659851e-07, | |
| "loss": 0.034, | |
| "reward": 2.050297737121582, | |
| "reward_std": 0.4812738299369812, | |
| "rewards/": 7.299107074737549, | |
| "rewards/math_compute_score": 0.738095223903656, | |
| "step": 332 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1663.761962890625, | |
| "epoch": 0.7947494033412887, | |
| "grad_norm": 0.23048288658260976, | |
| "kl": 0.005828857421875, | |
| "learning_rate": 1.0668373097507921e-07, | |
| "loss": 0.0221, | |
| "reward": 1.6446057558059692, | |
| "reward_std": 0.6130920052528381, | |
| "rewards/": 6.889695167541504, | |
| "rewards/math_compute_score": 0.3333333432674408, | |
| "step": 333 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1342.952392578125, | |
| "epoch": 0.7971360381861575, | |
| "grad_norm": 0.2658342486141691, | |
| "kl": 0.00714111328125, | |
| "learning_rate": 1.0430675299495973e-07, | |
| "loss": -0.0088, | |
| "reward": 1.744512677192688, | |
| "reward_std": 0.5626569986343384, | |
| "rewards/": 6.246372699737549, | |
| "rewards/math_compute_score": 0.6190476417541504, | |
| "step": 334 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1467.0, | |
| "epoch": 0.7995226730310262, | |
| "grad_norm": 0.23546913033927908, | |
| "kl": 0.00677490234375, | |
| "learning_rate": 1.0195346714717812e-07, | |
| "loss": -0.0662, | |
| "reward": 1.8030506372451782, | |
| "reward_std": 0.6685277223587036, | |
| "rewards/": 6.824777126312256, | |
| "rewards/math_compute_score": 0.5476190447807312, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1890.9285888671875, | |
| "epoch": 0.801909307875895, | |
| "grad_norm": 0.24627575755994274, | |
| "kl": 0.0064697265625, | |
| "learning_rate": 9.962401433471984e-08, | |
| "loss": 0.016, | |
| "reward": 1.0757441520690918, | |
| "reward_std": 0.5567899346351624, | |
| "rewards/": 6.140625, | |
| "rewards/math_compute_score": -0.190476194024086, | |
| "step": 336 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1682.5, | |
| "epoch": 0.8042959427207638, | |
| "grad_norm": 0.2843068676706687, | |
| "kl": 0.00732421875, | |
| "learning_rate": 9.731853403356705e-08, | |
| "loss": 0.0064, | |
| "reward": 1.4380580186843872, | |
| "reward_std": 0.3497306704521179, | |
| "rewards/": 6.047433376312256, | |
| "rewards/math_compute_score": 0.2857142984867096, | |
| "step": 337 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1401.642822265625, | |
| "epoch": 0.8066825775656324, | |
| "grad_norm": 0.29907908692795154, | |
| "kl": 0.007476806640625, | |
| "learning_rate": 9.503716428434799e-08, | |
| "loss": -0.0348, | |
| "reward": 1.8360120058059692, | |
| "reward_std": 0.4471094012260437, | |
| "rewards/": 6.989583492279053, | |
| "rewards/math_compute_score": 0.5476190447807312, | |
| "step": 338 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1525.6190185546875, | |
| "epoch": 0.8090692124105012, | |
| "grad_norm": 0.28235655011179184, | |
| "kl": 0.00653076171875, | |
| "learning_rate": 9.27800416840715e-08, | |
| "loss": 0.0183, | |
| "reward": 2.1017115116119385, | |
| "reward_std": 0.5442370176315308, | |
| "rewards/": 7.4609375, | |
| "rewards/math_compute_score": 0.761904776096344, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.8114558472553699, | |
| "grad_norm": 0.23832517340240178, | |
| "learning_rate": 9.054730137794886e-08, | |
| "loss": 0.014, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.8114558472553699, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1661.107177734375, | |
| "eval_kl": 0.00655364990234375, | |
| "eval_loss": 0.023055095225572586, | |
| "eval_reward": 1.863730102777481, | |
| "eval_reward_std": 0.5041995421051979, | |
| "eval_rewards/": 6.818650126457214, | |
| "eval_rewards/math_compute_score": 0.6250000149011612, | |
| "eval_runtime": 89.1847, | |
| "eval_samples_per_second": 0.235, | |
| "eval_steps_per_second": 0.011, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1674.6666870117188, | |
| "epoch": 0.8138424821002387, | |
| "grad_norm": 0.2632361321457208, | |
| "kl": 0.00665283203125, | |
| "learning_rate": 8.833907705130089e-08, | |
| "loss": 0.0161, | |
| "reward": 1.6610864400863647, | |
| "reward_std": 0.4744073450565338, | |
| "rewards/": 6.8292412757873535, | |
| "rewards/math_compute_score": 0.3690476194024086, | |
| "step": 341 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1667.90478515625, | |
| "epoch": 0.8162291169451074, | |
| "grad_norm": 0.2531099565121514, | |
| "kl": 0.00653076171875, | |
| "learning_rate": 8.615550092155477e-08, | |
| "loss": 0.0291, | |
| "reward": 1.6218007802963257, | |
| "reward_std": 0.5996249318122864, | |
| "rewards/": 6.680431842803955, | |
| "rewards/math_compute_score": 0.3571428656578064, | |
| "step": 342 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1443.9761962890625, | |
| "epoch": 0.8186157517899761, | |
| "grad_norm": 0.3708436064315245, | |
| "kl": 0.007537841796875, | |
| "learning_rate": 8.399670373032663e-08, | |
| "loss": 0.0949, | |
| "reward": 2.0575146675109863, | |
| "reward_std": 0.7847402691841125, | |
| "rewards/": 7.906621932983398, | |
| "rewards/math_compute_score": 0.5952380895614624, | |
| "step": 343 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1706.9285888671875, | |
| "epoch": 0.8210023866348448, | |
| "grad_norm": 0.2377499384119013, | |
| "kl": 0.006683349609375, | |
| "learning_rate": 8.186281473559381e-08, | |
| "loss": 0.0243, | |
| "reward": 2.0348215103149414, | |
| "reward_std": 0.32591211795806885, | |
| "rewards/": 7.316964626312256, | |
| "rewards/math_compute_score": 0.7142857313156128, | |
| "step": 344 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1619.357177734375, | |
| "epoch": 0.8233890214797136, | |
| "grad_norm": 0.26691977510844433, | |
| "kl": 0.00634765625, | |
| "learning_rate": 7.97539617039552e-08, | |
| "loss": 0.0268, | |
| "reward": 1.8893601894378662, | |
| "reward_std": 0.6046797633171082, | |
| "rewards/": 7.065848350524902, | |
| "rewards/math_compute_score": 0.5952380895614624, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1332.452392578125, | |
| "epoch": 0.8257756563245824, | |
| "grad_norm": 0.26196875577278594, | |
| "kl": 0.006988525390625, | |
| "learning_rate": 7.767027090298206e-08, | |
| "loss": 0.0965, | |
| "reward": 2.3058035373687744, | |
| "reward_std": 0.30486685037612915, | |
| "rewards/": 7.909970283508301, | |
| "rewards/math_compute_score": 0.9047619104385376, | |
| "step": 346 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1640.8809814453125, | |
| "epoch": 0.8281622911694511, | |
| "grad_norm": 0.3180196339273012, | |
| "kl": 0.00750732421875, | |
| "learning_rate": 7.561186709365652e-08, | |
| "loss": 0.0582, | |
| "reward": 1.2651599645614624, | |
| "reward_std": 0.5785154700279236, | |
| "rewards/": 6.4210381507873535, | |
| "rewards/math_compute_score": -0.02380952425301075, | |
| "step": 347 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1549.4285888671875, | |
| "epoch": 0.8305489260143198, | |
| "grad_norm": 0.23148361007108187, | |
| "kl": 0.00537109375, | |
| "learning_rate": 7.357887352290227e-08, | |
| "loss": -0.0128, | |
| "reward": 1.6349704265594482, | |
| "reward_std": 0.49700257182121277, | |
| "rewards/": 6.936756134033203, | |
| "rewards/math_compute_score": 0.3095238208770752, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1679.8095703125, | |
| "epoch": 0.8329355608591885, | |
| "grad_norm": 0.25788580600104904, | |
| "kl": 0.006622314453125, | |
| "learning_rate": 7.157141191620548e-08, | |
| "loss": 0.0082, | |
| "reward": 1.3982887268066406, | |
| "reward_std": 0.604521632194519, | |
| "rewards/": 6.42001485824585, | |
| "rewards/math_compute_score": 0.1428571492433548, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.8353221957040573, | |
| "grad_norm": 0.30282291813716555, | |
| "learning_rate": 6.958960247032513e-08, | |
| "loss": 0.0621, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.8353221957040573, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1671.3452758789062, | |
| "eval_kl": 0.006561279296875, | |
| "eval_loss": 0.03596196323633194, | |
| "eval_reward": 1.8592354953289032, | |
| "eval_reward_std": 0.509850487112999, | |
| "eval_rewards/": 6.89141571521759, | |
| "eval_rewards/math_compute_score": 0.6011904925107956, | |
| "eval_runtime": 91.8744, | |
| "eval_samples_per_second": 0.229, | |
| "eval_steps_per_second": 0.011, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1468.9166870117188, | |
| "epoch": 0.837708830548926, | |
| "grad_norm": 0.253510225674823, | |
| "kl": 0.00677490234375, | |
| "learning_rate": 6.763356384609809e-08, | |
| "loss": 0.0293, | |
| "reward": 1.8625745177268982, | |
| "reward_std": 0.5618998408317566, | |
| "rewards/": 7.3128721714019775, | |
| "rewards/math_compute_score": 0.5000000149011612, | |
| "step": 351 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1764.8095703125, | |
| "epoch": 0.8400954653937948, | |
| "grad_norm": 0.24138171322547658, | |
| "kl": 0.005584716796875, | |
| "learning_rate": 6.570341316133272e-08, | |
| "loss": 0.0248, | |
| "reward": 1.5279762744903564, | |
| "reward_std": 0.5388675928115845, | |
| "rewards/": 6.782738208770752, | |
| "rewards/math_compute_score": 0.2142857164144516, | |
| "step": 352 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1326.547607421875, | |
| "epoch": 0.8424821002386634, | |
| "grad_norm": 0.2826923722060457, | |
| "kl": 0.00726318359375, | |
| "learning_rate": 6.379926598379725e-08, | |
| "loss": 0.0152, | |
| "reward": 2.1721725463867188, | |
| "reward_std": 0.3049697279930115, | |
| "rewards/": 7.718006134033203, | |
| "rewards/math_compute_score": 0.785714328289032, | |
| "step": 353 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1451.4285888671875, | |
| "epoch": 0.8448687350835322, | |
| "grad_norm": 0.3179144298199542, | |
| "kl": 0.006561279296875, | |
| "learning_rate": 6.192123632429985e-08, | |
| "loss": 0.0487, | |
| "reward": 1.677864670753479, | |
| "reward_std": 0.46759727597236633, | |
| "rewards/": 7.151227951049805, | |
| "rewards/math_compute_score": 0.3095238208770752, | |
| "step": 354 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1683.1190185546875, | |
| "epoch": 0.847255369928401, | |
| "grad_norm": 0.29414085786077687, | |
| "kl": 0.006103515625, | |
| "learning_rate": 6.006943662986275e-08, | |
| "loss": 0.0241, | |
| "reward": 1.278906226158142, | |
| "reward_std": 0.5312941670417786, | |
| "rewards/": 6.680245876312256, | |
| "rewards/math_compute_score": -0.0714285746216774, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1503.5, | |
| "epoch": 0.8496420047732697, | |
| "grad_norm": 0.3026402934237509, | |
| "kl": 0.0067138671875, | |
| "learning_rate": 5.824397777698858e-08, | |
| "loss": 0.1011, | |
| "reward": 1.7272508144378662, | |
| "reward_std": 0.4776250720024109, | |
| "rewards/": 7.017206192016602, | |
| "rewards/math_compute_score": 0.4047619104385376, | |
| "step": 356 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1564.90478515625, | |
| "epoch": 0.8520286396181385, | |
| "grad_norm": 0.2936342545079758, | |
| "kl": 0.00634765625, | |
| "learning_rate": 5.644496906502233e-08, | |
| "loss": 0.0701, | |
| "reward": 1.853273868560791, | |
| "reward_std": 0.5276380181312561, | |
| "rewards/": 7.171131134033203, | |
| "rewards/math_compute_score": 0.523809552192688, | |
| "step": 357 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1197.0, | |
| "epoch": 0.8544152744630071, | |
| "grad_norm": 0.30773278900467704, | |
| "kl": 0.00738525390625, | |
| "learning_rate": 5.4672518209607e-08, | |
| "loss": -0.0002, | |
| "reward": 1.991220235824585, | |
| "reward_std": 0.5516120195388794, | |
| "rewards/": 6.622767925262451, | |
| "rewards/math_compute_score": 0.8333333730697632, | |
| "step": 358 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1567.166748046875, | |
| "epoch": 0.8568019093078759, | |
| "grad_norm": 0.2473090817946399, | |
| "kl": 0.0057373046875, | |
| "learning_rate": 5.292673133623371e-08, | |
| "loss": -0.0307, | |
| "reward": 1.5518603324890137, | |
| "reward_std": 0.48620352149009705, | |
| "rewards/": 6.711681842803955, | |
| "rewards/math_compute_score": 0.261904776096344, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.8591885441527446, | |
| "grad_norm": 0.23273491855228218, | |
| "learning_rate": 5.1207712973887876e-08, | |
| "loss": 0.0191, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.8591885441527446, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 1646.3214721679688, | |
| "eval_kl": 0.00658416748046875, | |
| "eval_loss": 0.017793310806155205, | |
| "eval_reward": 1.8062395751476288, | |
| "eval_reward_std": 0.5819119140505791, | |
| "eval_rewards/": 6.840721607208252, | |
| "eval_rewards/math_compute_score": 0.5476190708577633, | |
| "eval_runtime": 89.4518, | |
| "eval_samples_per_second": 0.235, | |
| "eval_steps_per_second": 0.011, | |
| "step": 360 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 419, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 40, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 14, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |