{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9463722397476341,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completion_length": 405.5555674235026,
      "epoch": 0.0037854889589905363,
      "grad_norm": 1.5204231066145135,
      "kl": 0.0,
      "learning_rate": 5.555555555555555e-09,
      "loss": 0.0329,
      "reward": 0.3750000099341075,
      "reward_std": 0.3891436904668808,
      "rewards/equation_reward_func": 0.3472222325702508,
      "rewards/format_reward_func": 0.027777778605620067,
      "step": 2
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 426.1597366333008,
      "epoch": 0.007570977917981073,
      "grad_norm": 1.6075594847685568,
      "kl": 0.00020535786946614584,
      "learning_rate": 1.111111111111111e-08,
      "loss": 0.0004,
      "reward": 0.35416667846341926,
      "reward_std": 0.40144437551498413,
      "rewards/equation_reward_func": 0.3333333432674408,
      "rewards/format_reward_func": 0.02083333395421505,
      "step": 4
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 389.59028879801434,
      "epoch": 0.011356466876971609,
      "grad_norm": 1.7738388926882676,
      "kl": 0.00020662943522135416,
      "learning_rate": 1.6666666666666667e-08,
      "loss": 0.0068,
      "reward": 0.3611111206312974,
      "reward_std": 0.34669753164052963,
      "rewards/equation_reward_func": 0.3263888942698638,
      "rewards/format_reward_func": 0.034722223257025085,
      "step": 6
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 407.18751271565753,
      "epoch": 0.015141955835962145,
      "grad_norm": 1.7413085729902613,
      "kl": 0.00020869572957356772,
      "learning_rate": 2.222222222222222e-08,
      "loss": -0.0175,
      "reward": 0.3750000149011612,
      "reward_std": 0.43933459122975665,
      "rewards/equation_reward_func": 0.36111112497746944,
      "rewards/format_reward_func": 0.013888889302810034,
      "step": 8
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 436.61806615193683,
      "epoch": 0.01892744479495268,
      "grad_norm": 1.5523678031549322,
      "kl": 0.0001990795135498047,
      "learning_rate": 2.7777777777777774e-08,
      "loss": 0.0009,
      "reward": 0.3958333482344945,
      "reward_std": 0.4240533635020256,
      "rewards/equation_reward_func": 0.3888889029622078,
      "rewards/format_reward_func": 0.006944444651405017,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 417.5763982137044,
      "epoch": 0.022712933753943218,
      "grad_norm": 2.1871520648907357,
      "kl": 0.0002460479736328125,
      "learning_rate": 3.3333333333333334e-08,
      "loss": 0.0668,
      "reward": 0.31944445210198563,
      "reward_std": 0.3596703422566255,
      "rewards/equation_reward_func": 0.31250000807146233,
      "rewards/format_reward_func": 0.006944444651405017,
      "step": 12
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 403.26390075683594,
      "epoch": 0.026498422712933754,
      "grad_norm": 1.6906264767615913,
      "kl": 0.00021004676818847656,
      "learning_rate": 3.888888888888889e-08,
      "loss": 0.0052,
      "reward": 0.3611111268401146,
      "reward_std": 0.42362942298253375,
      "rewards/equation_reward_func": 0.354166679084301,
      "rewards/format_reward_func": 0.006944444651405017,
      "step": 14
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 393.6944580078125,
      "epoch": 0.03028391167192429,
      "grad_norm": 1.8134093955469572,
      "kl": 0.0002319812774658203,
      "learning_rate": 4.444444444444444e-08,
      "loss": 0.0291,
      "reward": 0.4097222362955411,
      "reward_std": 0.43579815079768497,
      "rewards/equation_reward_func": 0.3888889004786809,
      "rewards/format_reward_func": 0.02083333395421505,
      "step": 16
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 440.2708460489909,
      "epoch": 0.03406940063091483,
      "grad_norm": 1.4606786067632986,
      "kl": 0.0002152125040690104,
      "learning_rate": 5e-08,
      "loss": 0.0191,
      "reward": 0.3888889004786809,
      "reward_std": 0.44846897075573605,
      "rewards/equation_reward_func": 0.3680555671453476,
      "rewards/format_reward_func": 0.02083333395421505,
      "step": 18
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 411.56251525878906,
      "epoch": 0.03785488958990536,
      "grad_norm": 8.74750891658874,
      "kl": 0.00022975603739420572,
      "learning_rate": 5.555555555555555e-08,
      "loss": 0.0165,
      "reward": 0.38888889861603576,
      "reward_std": 0.3779858859876792,
      "rewards/equation_reward_func": 0.38888889861603576,
      "rewards/format_reward_func": 0.0,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 410.5694580078125,
      "epoch": 0.0416403785488959,
      "grad_norm": 2.0666026097367185,
      "kl": 0.0002140204111735026,
      "learning_rate": 6.111111111111111e-08,
      "loss": 0.0489,
      "reward": 0.4305555671453476,
      "reward_std": 0.4184086322784424,
      "rewards/equation_reward_func": 0.40277779164413613,
      "rewards/format_reward_func": 0.027777778605620067,
      "step": 22
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 398.5972366333008,
      "epoch": 0.045425867507886436,
      "grad_norm": 1.4353693037214081,
      "kl": 0.00022824605305989584,
      "learning_rate": 6.666666666666667e-08,
      "loss": 0.0561,
      "reward": 0.39583334140479565,
      "reward_std": 0.38249212006727856,
      "rewards/equation_reward_func": 0.37500000807146233,
      "rewards/format_reward_func": 0.02083333395421505,
      "step": 24
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 406.06251525878906,
      "epoch": 0.04921135646687697,
      "grad_norm": 2.0297758030760487,
      "kl": 0.00023778279622395834,
      "learning_rate": 7.222222222222221e-08,
      "loss": -0.036,
      "reward": 0.2847222263614337,
      "reward_std": 0.35836515327294666,
      "rewards/equation_reward_func": 0.2638888942698638,
      "rewards/format_reward_func": 0.02083333395421505,
      "step": 26
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 404.18751271565753,
      "epoch": 0.05299684542586751,
      "grad_norm": 1.7807227026542323,
      "kl": 0.0002464453379313151,
      "learning_rate": 7.777777777777778e-08,
      "loss": -0.0037,
      "reward": 0.3819444552063942,
      "reward_std": 0.3984878833095233,
      "rewards/equation_reward_func": 0.37500001055498916,
      "rewards/format_reward_func": 0.006944444651405017,
      "step": 28
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 443.22918192545575,
      "epoch": 0.056782334384858045,
      "grad_norm": 1.4639198723709446,
      "kl": 0.0002082983652750651,
      "learning_rate": 8.333333333333333e-08,
      "loss": 0.0215,
      "reward": 0.28472222946584225,
      "reward_std": 0.35284433389703435,
      "rewards/equation_reward_func": 0.26388889613250893,
      "rewards/format_reward_func": 0.02083333395421505,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 417.1458460489909,
      "epoch": 0.06056782334384858,
      "grad_norm": 1.551783394227111,
      "kl": 0.0002196629842122396,
      "learning_rate": 8.888888888888888e-08,
      "loss": -0.0381,
      "reward": 0.4236111231148243,
      "reward_std": 0.4627470038831234,
      "rewards/equation_reward_func": 0.409722230086724,
      "rewards/format_reward_func": 0.013888889302810034,
      "step": 32
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 425.0972366333008,
      "epoch": 0.06435331230283911,
      "grad_norm": 1.6519839518228945,
      "kl": 0.0002177556355794271,
      "learning_rate": 9.444444444444444e-08,
      "loss": 0.0149,
      "reward": 0.28472222946584225,
      "reward_std": 0.36097555483380955,
      "rewards/equation_reward_func": 0.26388889489074546,
      "rewards/format_reward_func": 0.02083333395421505,
      "step": 34
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 416.1597315470378,
      "epoch": 0.06813880126182965,
      "grad_norm": 1.6298419922409495,
      "kl": 0.00024358431498209635,
      "learning_rate": 1e-07,
      "loss": 0.0544,
      "reward": 0.31250000931322575,
      "reward_std": 0.406619085619847,
      "rewards/equation_reward_func": 0.27777778667708236,
      "rewards/format_reward_func": 0.034722223257025085,
      "step": 36
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 406.13195546468097,
      "epoch": 0.07192429022082018,
      "grad_norm": 1.7781933588930947,
      "kl": 0.00020241737365722656,
      "learning_rate": 1.0555555555555555e-07,
      "loss": 0.0181,
      "reward": 0.5208333432674408,
      "reward_std": 0.48631447553634644,
      "rewards/equation_reward_func": 0.5000000074505806,
      "rewards/format_reward_func": 0.02083333395421505,
      "step": 38
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 442.87500890096027,
      "epoch": 0.07570977917981073,
      "grad_norm": 1.7856778327927993,
      "kl": 0.00023746490478515625,
      "learning_rate": 1.111111111111111e-07,
      "loss": -0.0027,
      "reward": 0.32638889613250893,
      "reward_std": 0.37259839847683907,
      "rewards/equation_reward_func": 0.31250000807146233,
      "rewards/format_reward_func": 0.013888889302810034,
      "step": 40
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 375.65973409016925,
      "epoch": 0.07949526813880126,
      "grad_norm": 1.9930393569793248,
      "kl": 0.00021648406982421875,
      "learning_rate": 1.1666666666666667e-07,
      "loss": 0.0641,
      "reward": 0.4236111268401146,
      "reward_std": 0.38463745390375453,
      "rewards/equation_reward_func": 0.4027777910232544,
      "rewards/format_reward_func": 0.02083333395421505,
      "step": 42
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 411.6736195882161,
      "epoch": 0.0832807570977918,
      "grad_norm": 1.4888095636144503,
      "kl": 0.0002304712931315104,
      "learning_rate": 1.2222222222222222e-07,
      "loss": 0.0313,
      "reward": 0.31944445210198563,
      "reward_std": 0.3178868380685647,
      "rewards/equation_reward_func": 0.28472223194936913,
      "rewards/format_reward_func": 0.034722223257025085,
      "step": 44
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 389.4166793823242,
      "epoch": 0.08706624605678233,
      "grad_norm": 1.6283738307368585,
      "kl": 0.00023396809895833334,
      "learning_rate": 1.2777777777777777e-07,
      "loss": 0.0686,
      "reward": 0.2986111169060071,
      "reward_std": 0.37988172471523285,
      "rewards/equation_reward_func": 0.2847222288449605,
      "rewards/format_reward_func": 0.013888889302810034,
      "step": 46
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 405.71528879801434,
      "epoch": 0.09085173501577287,
      "grad_norm": 12.938622660748152,
      "kl": 0.00023698806762695312,
      "learning_rate": 1.3333333333333334e-07,
      "loss": -0.0074,
      "reward": 0.2361111187686523,
      "reward_std": 0.3309923857450485,
      "rewards/equation_reward_func": 0.22222222574055195,
      "rewards/format_reward_func": 0.013888889302810034,
      "step": 48
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 434.0694580078125,
      "epoch": 0.0946372239747634,
      "grad_norm": 3.061348126208135,
      "kl": 0.00024008750915527344,
      "learning_rate": 1.3888888888888888e-07,
      "loss": -0.012,
      "reward": 0.27083334264655906,
      "reward_std": 0.34488533437252045,
      "rewards/equation_reward_func": 0.25694445086022216,
      "rewards/format_reward_func": 0.013888889302810034,
      "step": 50
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 413.0694529215495,
      "epoch": 0.09842271293375394,
      "grad_norm": 3.454024027390986,
      "kl": 0.0003235340118408203,
      "learning_rate": 1.4444444444444442e-07,
      "loss": -0.0064,
      "reward": 0.40972223194936913,
      "reward_std": 0.3772713306049506,
      "rewards/equation_reward_func": 0.40277778916060925,
      "rewards/format_reward_func": 0.006944444651405017,
      "step": 52
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 446.2986195882161,
      "epoch": 0.10220820189274447,
      "grad_norm": 1.6865767812775654,
      "kl": 0.00020933151245117188,
      "learning_rate": 1.5e-07,
      "loss": 0.0067,
      "reward": 0.3750000136593978,
      "reward_std": 0.36897342403729755,
      "rewards/equation_reward_func": 0.3611111231148243,
      "rewards/format_reward_func": 0.013888889302810034,
      "step": 54
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 441.1736246744792,
      "epoch": 0.10599369085173502,
      "grad_norm": 2.2986869828700334,
      "kl": 0.0004001458485921224,
      "learning_rate": 1.5555555555555556e-07,
      "loss": 0.0206,
      "reward": 0.3541666716337204,
      "reward_std": 0.3243444561958313,
      "rewards/equation_reward_func": 0.3472222263614337,
      "rewards/format_reward_func": 0.006944444651405017,
      "step": 56
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 432.27085240681964,
      "epoch": 0.10977917981072555,
      "grad_norm": 2.1906732682758645,
      "kl": 0.0002334117889404297,
      "learning_rate": 1.611111111111111e-07,
      "loss": 0.0075,
      "reward": 0.40972223194936913,
      "reward_std": 0.4255252617100875,
      "rewards/equation_reward_func": 0.39583334513008595,
      "rewards/format_reward_func": 0.013888889302810034,
      "step": 58
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 426.7569529215495,
      "epoch": 0.11356466876971609,
      "grad_norm": 2.041653144281195,
      "kl": 0.0002582073211669922,
      "learning_rate": 1.6666666666666665e-07,
      "loss": 0.0211,
      "reward": 0.3680555634200573,
      "reward_std": 0.40922948469718295,
      "rewards/equation_reward_func": 0.361111119389534,
      "rewards/format_reward_func": 0.006944444651405017,
      "step": 60
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 400.68751271565753,
      "epoch": 0.11735015772870662,
      "grad_norm": 1.93215349954409,
      "kl": 0.0002829233805338542,
      "learning_rate": 1.7222222222222222e-07,
      "loss": -0.033,
      "reward": 0.4097222400208314,
      "reward_std": 0.45954596251249313,
      "rewards/equation_reward_func": 0.3888889004786809,
      "rewards/format_reward_func": 0.02083333395421505,
      "step": 62
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 429.3125178019206,
      "epoch": 0.12113564668769716,
      "grad_norm": 1.8873459369947734,
      "kl": 0.0002741813659667969,
      "learning_rate": 1.7777777777777776e-07,
      "loss": 0.0397,
      "reward": 0.4305555708706379,
      "reward_std": 0.41432634244362515,
      "rewards/equation_reward_func": 0.4027777848144372,
      "rewards/format_reward_func": 0.027777778605620067,
      "step": 64
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 421.88890329996747,
      "epoch": 0.12492113564668769,
      "grad_norm": 1.688287521815126,
      "kl": 0.00026599566141764325,
      "learning_rate": 1.833333333333333e-07,
      "loss": 0.0008,
      "reward": 0.3472222276031971,
      "reward_std": 0.3376887192328771,
      "rewards/equation_reward_func": 0.3402777823309104,
      "rewards/format_reward_func": 0.006944444651405017,
      "step": 66
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 452.2708447774251,
      "epoch": 0.12870662460567822,
      "grad_norm": 1.37663800323155,
      "kl": 0.0003294944763183594,
      "learning_rate": 1.8888888888888888e-07,
      "loss": 0.0556,
      "reward": 0.3263888992369175,
      "reward_std": 0.2874133574465911,
      "rewards/equation_reward_func": 0.312500008692344,
      "rewards/format_reward_func": 0.013888889302810034,
      "step": 68
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 436.71528879801434,
      "epoch": 0.13249211356466878,
      "grad_norm": 1.4483289492436444,
      "kl": 0.00029428799947102863,
      "learning_rate": 1.9444444444444445e-07,
      "loss": 0.0386,
      "reward": 0.29861111876865226,
      "reward_std": 0.3121309739847978,
      "rewards/equation_reward_func": 0.28472222822407883,
      "rewards/format_reward_func": 0.013888889302810034,
      "step": 70
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 414.62501271565753,
      "epoch": 0.1362776025236593,
      "grad_norm": 1.4269628803342047,
      "kl": 0.0002837181091308594,
      "learning_rate": 2e-07,
      "loss": 0.0402,
      "reward": 0.32638889861603576,
      "reward_std": 0.35836514706412953,
      "rewards/equation_reward_func": 0.29861111752688885,
      "rewards/format_reward_func": 0.027777778605620067,
      "step": 72
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 398.69445546468097,
      "epoch": 0.14006309148264984,
      "grad_norm": 1.3415769326825684,
      "kl": 0.00044043858846028644,
      "learning_rate": 2.0555555555555553e-07,
      "loss": -0.0574,
      "reward": 0.3333333420256774,
      "reward_std": 0.33815376708904904,
      "rewards/equation_reward_func": 0.31250000682969886,
      "rewards/format_reward_func": 0.02083333395421505,
      "step": 74
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 397.2708460489909,
      "epoch": 0.14384858044164037,
      "grad_norm": 1.6736466199506606,
      "kl": 0.0003532568613688151,
      "learning_rate": 2.111111111111111e-07,
      "loss": 0.0355,
      "reward": 0.3958333519597848,
      "reward_std": 0.3349916177491347,
      "rewards/equation_reward_func": 0.3750000173846881,
      "rewards/format_reward_func": 0.02083333395421505,
      "step": 76
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 389.6388982137044,
      "epoch": 0.14763406940063092,
      "grad_norm": 1.6778745969677393,
      "kl": 0.0004076957702636719,
      "learning_rate": 2.1666666666666667e-07,
      "loss": -0.0089,
      "reward": 0.3819444514811039,
      "reward_std": 0.3766806833446026,
      "rewards/equation_reward_func": 0.3680555646618207,
      "rewards/format_reward_func": 0.013888889302810034,
      "step": 78
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 369.4305648803711,
      "epoch": 0.15141955835962145,
      "grad_norm": 1.7763243336052263,
      "kl": 0.0004963874816894531,
      "learning_rate": 2.222222222222222e-07,
      "loss": 0.0596,
      "reward": 0.3541666716337204,
      "reward_std": 0.4322179580728213,
      "rewards/equation_reward_func": 0.3125000049670537,
      "rewards/format_reward_func": 0.0416666679084301,
      "step": 80
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 418.77778879801434,
      "epoch": 0.15520504731861198,
      "grad_norm": 2.4954676920223084,
      "kl": 0.0005669593811035156,
      "learning_rate": 2.2777777777777776e-07,
      "loss": 0.0353,
      "reward": 0.4583333469927311,
      "reward_std": 0.4091739282011986,
      "rewards/equation_reward_func": 0.4305555659035842,
      "rewards/format_reward_func": 0.027777778605620067,
      "step": 82
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 389.9583460489909,
      "epoch": 0.1589905362776025,
      "grad_norm": 1.9030809806319569,
      "kl": 0.0004928906758626302,
      "learning_rate": 2.3333333333333333e-07,
      "loss": 0.047,
      "reward": 0.4236111255983512,
      "reward_std": 0.4178568907082081,
      "rewards/equation_reward_func": 0.4097222325702508,
      "rewards/format_reward_func": 0.013888889302810034,
      "step": 84
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 442.4930648803711,
      "epoch": 0.16277602523659307,
      "grad_norm": 1.3770817743623749,
      "kl": 0.0006133715311686198,
      "learning_rate": 2.388888888888889e-07,
      "loss": 0.0004,
      "reward": 0.4166666753590107,
      "reward_std": 0.37612894798318547,
      "rewards/equation_reward_func": 0.3958333469927311,
      "rewards/format_reward_func": 0.02083333395421505,
      "step": 86
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 443.9444529215495,
      "epoch": 0.1665615141955836,
      "grad_norm": 1.4650231298226628,
      "kl": 0.0006745656331380209,
      "learning_rate": 2.4444444444444445e-07,
      "loss": -0.0017,
      "reward": 0.3750000111758709,
      "reward_std": 0.3954201638698578,
      "rewards/equation_reward_func": 0.36805556652446586,
      "rewards/format_reward_func": 0.006944444651405017,
      "step": 88
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 429.5069630940755,
      "epoch": 0.17034700315457413,
      "grad_norm": 1.9750543844698667,
      "kl": 0.000976403554280599,
      "learning_rate": 2.5e-07,
      "loss": 0.0418,
      "reward": 0.36805556528270245,
      "reward_std": 0.4254308380186558,
      "rewards/equation_reward_func": 0.34027778543531895,
      "rewards/format_reward_func": 0.027777778605620067,
      "step": 90
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 414.8263982137044,
      "epoch": 0.17413249211356466,
      "grad_norm": 3.4819218815816417,
      "kl": 0.000812689463297526,
      "learning_rate": 2.5555555555555553e-07,
      "loss": -0.073,
      "reward": 0.4166666766007741,
      "reward_std": 0.3864077205459277,
      "rewards/equation_reward_func": 0.4097222313284874,
      "rewards/format_reward_func": 0.006944444651405017,
      "step": 92
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 358.4236195882161,
      "epoch": 0.17791798107255521,
      "grad_norm": 3.17839003794858,
      "kl": 0.0010786056518554688,
      "learning_rate": 2.6111111111111113e-07,
      "loss": 0.0271,
      "reward": 0.4305555634200573,
      "reward_std": 0.4322568451364835,
      "rewards/equation_reward_func": 0.4027777872979641,
      "rewards/format_reward_func": 0.027777778605620067,
      "step": 94
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 421.7361208597819,
      "epoch": 0.18170347003154574,
      "grad_norm": 1.581354739245434,
      "kl": 0.0016581217447916667,
      "learning_rate": 2.6666666666666667e-07,
      "loss": -0.0128,
      "reward": 0.40972223194936913,
      "reward_std": 0.4322179468969504,
      "rewards/equation_reward_func": 0.40277778667708236,
      "rewards/format_reward_func": 0.006944444651405017,
      "step": 96
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 378.01390075683594,
      "epoch": 0.18548895899053627,
      "grad_norm": 1.6207630586247244,
      "kl": 0.0008861223856608073,
      "learning_rate": 2.7222222222222216e-07,
      "loss": -0.0275,
      "reward": 0.48611112001041573,
      "reward_std": 0.38690390810370445,
      "rewards/equation_reward_func": 0.4513888992369175,
      "rewards/format_reward_func": 0.034722223257025085,
      "step": 98
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 410.0486208597819,
      "epoch": 0.1892744479495268,
      "grad_norm": 1.6017744101080356,
      "kl": 0.0018717447916666667,
      "learning_rate": 2.7777777777777776e-07,
      "loss": 0.0666,
      "reward": 0.4166666778425376,
      "reward_std": 0.4304381770392259,
      "rewards/equation_reward_func": 0.38888889489074546,
      "rewards/format_reward_func": 0.027777778605620067,
      "step": 100
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 417.06945546468097,
      "epoch": 0.19305993690851736,
      "grad_norm": 1.781222955044469,
      "kl": 0.0016377766927083333,
      "learning_rate": 2.833333333333333e-07,
      "loss": 0.01,
      "reward": 0.3750000037252903,
      "reward_std": 0.4017697374025981,
      "rewards/equation_reward_func": 0.3472222263614337,
      "rewards/format_reward_func": 0.027777778605620067,
      "step": 102
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 394.3333460489909,
      "epoch": 0.1968454258675079,
      "grad_norm": 2.071755215843617,
      "kl": 0.0020945866902669272,
      "learning_rate": 2.8888888888888885e-07,
      "loss": 0.0326,
      "reward": 0.46527778543531895,
      "reward_std": 0.4230251908302307,
      "rewards/equation_reward_func": 0.42361111876865226,
      "rewards/format_reward_func": 0.0416666679084301,
      "step": 104
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 381.5902913411458,
      "epoch": 0.20063091482649842,
      "grad_norm": 4.141898501255106,
      "kl": 0.002117792765299479,
      "learning_rate": 2.9444444444444444e-07,
      "loss": 0.0573,
      "reward": 0.3819444632778565,
      "reward_std": 0.35283846283952397,
      "rewards/equation_reward_func": 0.36111112249394256,
      "rewards/format_reward_func": 0.02083333395421505,
      "step": 106
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 427.81251271565753,
      "epoch": 0.20441640378548895,
      "grad_norm": 1.8973650475253758,
      "kl": 0.0040442148844401045,
      "learning_rate": 3e-07,
      "loss": 0.0408,
      "reward": 0.40277778543531895,
      "reward_std": 0.3682141068081061,
      "rewards/equation_reward_func": 0.38194445582727593,
      "rewards/format_reward_func": 0.02083333395421505,
      "step": 108
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 419.87501017252606,
      "epoch": 0.2082018927444795,
      "grad_norm": 1.7756216737476342,
      "kl": 0.0018666585286458333,
      "learning_rate": 3.055555555555556e-07,
      "loss": -0.0149,
      "reward": 0.4722222350537777,
      "reward_std": 0.4178180123368899,
      "rewards/equation_reward_func": 0.45833334513008595,
      "rewards/format_reward_func": 0.013888889302810034,
      "step": 110
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 418.2222315470378,
      "epoch": 0.21198738170347003,
      "grad_norm": 1.5674546331115733,
      "kl": 0.0031108856201171875,
      "learning_rate": 3.111111111111111e-07,
      "loss": -0.0077,
      "reward": 0.36805556279917556,
      "reward_std": 0.36998799939950305,
      "rewards/equation_reward_func": 0.34027778543531895,
      "rewards/format_reward_func": 0.027777778605620067,
      "step": 112
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 365.8333460489909,
      "epoch": 0.21577287066246056,
      "grad_norm": 2.163488952988276,
      "kl": 0.003872553507486979,
      "learning_rate": 3.166666666666666e-07,
      "loss": 0.0248,
      "reward": 0.44444446079432964,
      "reward_std": 0.44138550013303757,
      "rewards/equation_reward_func": 0.41666667722165585,
      "rewards/format_reward_func": 0.027777778605620067,
      "step": 114
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 377.95140329996747,
      "epoch": 0.2195583596214511,
      "grad_norm": 1.9890628606938958,
      "kl": 0.0058383941650390625,
      "learning_rate": 3.222222222222222e-07,
      "loss": 0.0108,
      "reward": 0.44444446203609306,
      "reward_std": 0.4227793253958225,
      "rewards/equation_reward_func": 0.4097222338120143,
      "rewards/format_reward_func": 0.034722223257025085,
      "step": 116
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 390.0833485921224,
      "epoch": 0.22334384858044165,
      "grad_norm": 1.6950159200633848,
      "kl": 0.0033391316731770835,
      "learning_rate": 3.2777777777777776e-07,
      "loss": 0.0218,
      "reward": 0.5000000149011612,
      "reward_std": 0.46135225395361584,
      "rewards/equation_reward_func": 0.472222238779068,
      "rewards/format_reward_func": 0.027777778605620067,
      "step": 118
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 434.47223409016925,
      "epoch": 0.22712933753943218,
      "grad_norm": 1.6912972588519333,
      "kl": 0.0037129720052083335,
      "learning_rate": 3.333333333333333e-07,
      "loss": 0.0303,
      "reward": 0.402777789781491,
      "reward_std": 0.37431980296969414,
      "rewards/equation_reward_func": 0.3888888992369175,
      "rewards/format_reward_func": 0.013888889302810034,
      "step": 120
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 379.2916768391927,
      "epoch": 0.2309148264984227,
      "grad_norm": 1.7665244345202618,
      "kl": 0.010921478271484375,
      "learning_rate": 3.388888888888889e-07,
      "loss": 0.0464,
      "reward": 0.38194445086022216,
      "reward_std": 0.3815583561857541,
      "rewards/equation_reward_func": 0.3402777835726738,
      "rewards/format_reward_func": 0.041666667287548385,
      "step": 122
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 346.40973409016925,
      "epoch": 0.23470031545741324,
      "grad_norm": 1.7777246622024319,
      "kl": 0.0057525634765625,
      "learning_rate": 3.4444444444444444e-07,
      "loss": 0.073,
      "reward": 0.5555555739750465,
      "reward_std": 0.5072049958010515,
      "rewards/equation_reward_func": 0.5138889104127884,
      "rewards/format_reward_func": 0.0416666679084301,
      "step": 124
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 394.50001525878906,
      "epoch": 0.2384858044164038,
      "grad_norm": 1.2724495191880816,
      "kl": 0.005407969156901042,
      "learning_rate": 3.5e-07,
      "loss": 0.0085,
      "reward": 0.5416666809469461,
      "reward_std": 0.4325893906255563,
      "rewards/equation_reward_func": 0.5138889035830895,
      "rewards/format_reward_func": 0.027777778605620067,
      "step": 126
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 426.51390584309894,
      "epoch": 0.24227129337539433,
      "grad_norm": 1.7084837781655138,
      "kl": 0.013666788736979166,
      "learning_rate": 3.5555555555555553e-07,
      "loss": -0.0031,
      "reward": 0.40277779288589954,
      "reward_std": 0.4025290633241336,
      "rewards/equation_reward_func": 0.36805557149151963,
      "rewards/format_reward_func": 0.034722223257025085,
      "step": 128
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 393.2986195882161,
      "epoch": 0.24605678233438485,
      "grad_norm": 1.6652131128084247,
      "kl": 0.009862263997395834,
      "learning_rate": 3.6111111111111107e-07,
      "loss": 0.076,
      "reward": 0.5277777922650179,
      "reward_std": 0.4298570702473323,
      "rewards/equation_reward_func": 0.5000000111758709,
      "rewards/format_reward_func": 0.027777778605620067,
      "step": 130
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 387.29862213134766,
      "epoch": 0.24984227129337538,
      "grad_norm": 1.4235829543070357,
      "kl": 0.006196339925130208,
      "learning_rate": 3.666666666666666e-07,
      "loss": 0.0928,
      "reward": 0.5763888955116272,
      "reward_std": 0.4299643337726593,
      "rewards/equation_reward_func": 0.5277777959903082,
      "rewards/format_reward_func": 0.048611112559835114,
      "step": 132
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 412.95140329996747,
      "epoch": 0.25362776025236594,
      "grad_norm": 1.5899891064412919,
      "kl": 0.010592142740885416,
      "learning_rate": 3.722222222222222e-07,
      "loss": -0.0172,
      "reward": 0.46527778543531895,
      "reward_std": 0.4477427862584591,
      "rewards/equation_reward_func": 0.39583334264655906,
      "rewards/format_reward_func": 0.06944444589316845,
      "step": 134
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 375.7916768391927,
      "epoch": 0.25741324921135644,
      "grad_norm": 1.8581102947843942,
      "kl": 0.0097503662109375,
      "learning_rate": 3.7777777777777775e-07,
      "loss": 0.0656,
      "reward": 0.3819444539646308,
      "reward_std": 0.4159533294538657,
      "rewards/equation_reward_func": 0.3263888979951541,
      "rewards/format_reward_func": 0.055555557211240135,
      "step": 136
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 402.8333460489909,
      "epoch": 0.261198738170347,
      "grad_norm": 1.8615199132284905,
      "kl": 0.00942230224609375,
      "learning_rate": 3.8333333333333335e-07,
      "loss": 0.1235,
      "reward": 0.3958333407839139,
      "reward_std": 0.4607119709253311,
      "rewards/equation_reward_func": 0.33333334637184936,
      "rewards/format_reward_func": 0.06250000186264515,
      "step": 138
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 388.50695037841797,
      "epoch": 0.26498422712933756,
      "grad_norm": 1.4132445755845657,
      "kl": 0.031420389811197914,
      "learning_rate": 3.888888888888889e-07,
      "loss": -0.0025,
      "reward": 0.6388889079292616,
      "reward_std": 0.4517383811374505,
      "rewards/equation_reward_func": 0.5972222362955412,
      "rewards/format_reward_func": 0.0416666679084301,
      "step": 140
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 399.69445546468097,
      "epoch": 0.26876971608832806,
      "grad_norm": 1.6231049342643675,
      "kl": 0.011019388834635416,
      "learning_rate": 3.9444444444444444e-07,
      "loss": -0.0123,
      "reward": 0.5486111268401146,
      "reward_std": 0.4811764856179555,
      "rewards/equation_reward_func": 0.486111119389534,
      "rewards/format_reward_func": 0.06250000186264515,
      "step": 142
    },
    {
      "clip_ratio": 0.0,
| "completion_length": 386.03473409016925, | |
| "epoch": 0.2725552050473186, | |
| "grad_norm": 1.5773905454247337, | |
| "kl": 0.04953765869140625, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0301, | |
| "reward": 0.5486111262192329, | |
| "reward_std": 0.44820784653226536, | |
| "rewards/equation_reward_func": 0.48611111876865226, | |
| "rewards/format_reward_func": 0.06250000124176343, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 398.125005086263, | |
| "epoch": 0.2763406940063092, | |
| "grad_norm": 1.2456280183136625, | |
| "kl": 0.025739034016927082, | |
| "learning_rate": 4.055555555555555e-07, | |
| "loss": 0.0387, | |
| "reward": 0.5000000186264515, | |
| "reward_std": 0.321004219353199, | |
| "rewards/equation_reward_func": 0.4444444552063942, | |
| "rewards/format_reward_func": 0.055555557211240135, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 366.2986195882161, | |
| "epoch": 0.2801261829652997, | |
| "grad_norm": 1.5044443965642595, | |
| "kl": 0.021631876627604168, | |
| "learning_rate": 4.1111111111111107e-07, | |
| "loss": 0.0575, | |
| "reward": 0.43055557273328304, | |
| "reward_std": 0.34387076273560524, | |
| "rewards/equation_reward_func": 0.38888889613250893, | |
| "rewards/format_reward_func": 0.0416666679084301, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 372.86112721761066, | |
| "epoch": 0.28391167192429023, | |
| "grad_norm": 2.033550282447034, | |
| "kl": 0.02593231201171875, | |
| "learning_rate": 4.1666666666666667e-07, | |
| "loss": 0.1121, | |
| "reward": 0.6805555721124014, | |
| "reward_std": 0.5391590123375257, | |
| "rewards/equation_reward_func": 0.5486111287027597, | |
| "rewards/format_reward_func": 0.1319444477558136, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 340.0138969421387, | |
| "epoch": 0.28769716088328073, | |
| "grad_norm": 2.2987693590791363, | |
| "kl": 0.0315399169921875, | |
| "learning_rate": 4.222222222222222e-07, | |
| "loss": 0.0078, | |
| "reward": 0.5694444663822651, | |
| "reward_std": 0.4559611765046914, | |
| "rewards/equation_reward_func": 0.45833334761361283, | |
| "rewards/format_reward_func": 0.11111111318071683, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 378.94445546468097, | |
| "epoch": 0.2914826498422713, | |
| "grad_norm": 1.9562773273854706, | |
| "kl": 0.09186299641927083, | |
| "learning_rate": 4.2777777777777775e-07, | |
| "loss": 0.0201, | |
| "reward": 0.6041666741172472, | |
| "reward_std": 0.49037906900048256, | |
| "rewards/equation_reward_func": 0.500000017384688, | |
| "rewards/format_reward_func": 0.10416666915019353, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 360.2152913411458, | |
| "epoch": 0.29526813880126185, | |
| "grad_norm": 1.8560446617911341, | |
| "kl": 0.060872395833333336, | |
| "learning_rate": 4.3333333333333335e-07, | |
| "loss": 0.0208, | |
| "reward": 0.5208333457509676, | |
| "reward_std": 0.4362143650650978, | |
| "rewards/equation_reward_func": 0.4444444564481576, | |
| "rewards/format_reward_func": 0.07638888992369175, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 397.4930725097656, | |
| "epoch": 0.29905362776025235, | |
| "grad_norm": 1.7722682766121323, | |
| "kl": 0.03704833984375, | |
| "learning_rate": 4.3888888888888884e-07, | |
| "loss": 0.087, | |
| "reward": 0.6736111355324587, | |
| "reward_std": 0.4783005639910698, | |
| "rewards/equation_reward_func": 0.5277777922650179, | |
| "rewards/format_reward_func": 0.14583333705862364, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 340.4583485921224, | |
| "epoch": 0.3028391167192429, | |
| "grad_norm": 1.8020563834520036, | |
| "kl": 0.052164713541666664, | |
| "learning_rate": 4.444444444444444e-07, | |
| "loss": 0.0567, | |
| "reward": 0.4791666753590107, | |
| "reward_std": 0.42059509828686714, | |
| "rewards/equation_reward_func": 0.4027777860562007, | |
| "rewards/format_reward_func": 0.07638888992369175, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 331.2986234029134, | |
| "epoch": 0.30662460567823346, | |
| "grad_norm": 1.816923697504477, | |
| "kl": 0.14789835611979166, | |
| "learning_rate": 4.5e-07, | |
| "loss": 0.0408, | |
| "reward": 0.6319444521019856, | |
| "reward_std": 0.40209560344616574, | |
| "rewards/equation_reward_func": 0.5208333420256773, | |
| "rewards/format_reward_func": 0.11111111318071683, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 374.74306615193683, | |
| "epoch": 0.31041009463722397, | |
| "grad_norm": 1.6763316060977995, | |
| "kl": 0.04315185546875, | |
| "learning_rate": 4.555555555555555e-07, | |
| "loss": 0.0657, | |
| "reward": 0.652777798473835, | |
| "reward_std": 0.47464097539583844, | |
| "rewards/equation_reward_func": 0.4722222313284874, | |
| "rewards/format_reward_func": 0.180555559694767, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 377.5416768391927, | |
| "epoch": 0.3141955835962145, | |
| "grad_norm": 1.6769854770195711, | |
| "kl": 0.069580078125, | |
| "learning_rate": 4.611111111111111e-07, | |
| "loss": 0.0887, | |
| "reward": 0.6527777897814909, | |
| "reward_std": 0.5109836533665657, | |
| "rewards/equation_reward_func": 0.4722222400208314, | |
| "rewards/format_reward_func": 0.180555559694767, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 355.3680648803711, | |
| "epoch": 0.317981072555205, | |
| "grad_norm": 1.5011515214179916, | |
| "kl": 0.19896443684895834, | |
| "learning_rate": 4.6666666666666666e-07, | |
| "loss": 0.0834, | |
| "reward": 0.7638889228304228, | |
| "reward_std": 0.5609942426284155, | |
| "rewards/equation_reward_func": 0.569444460173448, | |
| "rewards/format_reward_func": 0.1944444483766953, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 353.7361195882161, | |
| "epoch": 0.3217665615141956, | |
| "grad_norm": 15.47233883807027, | |
| "kl": 0.056732177734375, | |
| "learning_rate": 4.722222222222222e-07, | |
| "loss": 0.0426, | |
| "reward": 0.7500000124176344, | |
| "reward_std": 0.5186516791582108, | |
| "rewards/equation_reward_func": 0.5000000161429247, | |
| "rewards/format_reward_func": 0.25000000931322575, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 308.31251017252606, | |
| "epoch": 0.32555205047318614, | |
| "grad_norm": 2.1360230493136325, | |
| "kl": 0.19114176432291666, | |
| "learning_rate": 4.777777777777778e-07, | |
| "loss": 0.1307, | |
| "reward": 0.8055555882553259, | |
| "reward_std": 0.5721215779582659, | |
| "rewards/equation_reward_func": 0.5000000086923441, | |
| "rewards/format_reward_func": 0.30555556404093903, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 314.75000762939453, | |
| "epoch": 0.32933753943217664, | |
| "grad_norm": 2.7019951929627415, | |
| "kl": 0.1749267578125, | |
| "learning_rate": 4.833333333333333e-07, | |
| "loss": 0.0575, | |
| "reward": 0.826388897995154, | |
| "reward_std": 0.5692646453777949, | |
| "rewards/equation_reward_func": 0.5138889017204443, | |
| "rewards/format_reward_func": 0.312500008692344, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 297.9722328186035, | |
| "epoch": 0.3331230283911672, | |
| "grad_norm": 2.3496202759776788, | |
| "kl": 0.3324991861979167, | |
| "learning_rate": 4.888888888888889e-07, | |
| "loss": 0.0734, | |
| "reward": 0.868055577079455, | |
| "reward_std": 0.6303805137674013, | |
| "rewards/equation_reward_func": 0.486111128081878, | |
| "rewards/format_reward_func": 0.3819444564481576, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 349.2708384195964, | |
| "epoch": 0.33690851735015775, | |
| "grad_norm": 7.352824169943818, | |
| "kl": 0.4977823893229167, | |
| "learning_rate": 4.944444444444445e-07, | |
| "loss": 0.0453, | |
| "reward": 0.770833362514774, | |
| "reward_std": 0.6322847319145998, | |
| "rewards/equation_reward_func": 0.3958333370586236, | |
| "rewards/format_reward_func": 0.37500001179675263, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 297.1666781107585, | |
| "epoch": 0.34069400630914826, | |
| "grad_norm": 24.940449042597443, | |
| "kl": 4.795857747395833, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0703, | |
| "reward": 1.0555555820465088, | |
| "reward_std": 0.5782317991058031, | |
| "rewards/equation_reward_func": 0.5555555745959282, | |
| "rewards/format_reward_func": 0.5000000211099783, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 271.9027849833171, | |
| "epoch": 0.3444794952681388, | |
| "grad_norm": 2.5871456184851414, | |
| "kl": 14.199259440104166, | |
| "learning_rate": 4.999998543120144e-07, | |
| "loss": 0.0763, | |
| "reward": 1.0902778108914692, | |
| "reward_std": 0.5762393027544022, | |
| "rewards/equation_reward_func": 0.5694444589316845, | |
| "rewards/format_reward_func": 0.5208333482344946, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 309.31250890096027, | |
| "epoch": 0.3482649842271293, | |
| "grad_norm": 2.0327389699765313, | |
| "kl": 0.7781168619791666, | |
| "learning_rate": 4.999994172482276e-07, | |
| "loss": 0.1347, | |
| "reward": 0.895833358168602, | |
| "reward_std": 0.5533264875411987, | |
| "rewards/equation_reward_func": 0.43750001055498916, | |
| "rewards/format_reward_func": 0.4583333469927311, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 268.13889567057294, | |
| "epoch": 0.35205047318611987, | |
| "grad_norm": 15.941396290884985, | |
| "kl": 4.468831380208333, | |
| "learning_rate": 4.99998688809149e-07, | |
| "loss": 0.0794, | |
| "reward": 0.979166696468989, | |
| "reward_std": 0.5592605446775755, | |
| "rewards/equation_reward_func": 0.38194445582727593, | |
| "rewards/format_reward_func": 0.5972222338120142, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 247.0416742960612, | |
| "epoch": 0.35583596214511043, | |
| "grad_norm": 2.4432100547197657, | |
| "kl": 0.603515625, | |
| "learning_rate": 4.999976689956274e-07, | |
| "loss": 0.023, | |
| "reward": 1.1041666915019352, | |
| "reward_std": 0.5778869986534119, | |
| "rewards/equation_reward_func": 0.5138888973742723, | |
| "rewards/format_reward_func": 0.5902777935067812, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 248.41667302449545, | |
| "epoch": 0.35962145110410093, | |
| "grad_norm": 4.823568956607298, | |
| "kl": 1.6413167317708333, | |
| "learning_rate": 4.999963578088516e-07, | |
| "loss": 0.0856, | |
| "reward": 1.0694444874922435, | |
| "reward_std": 0.719012883802255, | |
| "rewards/equation_reward_func": 0.5069444564481577, | |
| "rewards/format_reward_func": 0.562500017384688, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 273.97917556762695, | |
| "epoch": 0.3634069400630915, | |
| "grad_norm": 2.743096965012267, | |
| "kl": 0.9237467447916666, | |
| "learning_rate": 4.999947552503497e-07, | |
| "loss": 0.1483, | |
| "reward": 1.1319444874922435, | |
| "reward_std": 0.6314157545566559, | |
| "rewards/equation_reward_func": 0.5208333457509676, | |
| "rewards/format_reward_func": 0.6111111268401146, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 246.41667556762695, | |
| "epoch": 0.36719242902208205, | |
| "grad_norm": 2.638981910331043, | |
| "kl": 0.8427327473958334, | |
| "learning_rate": 4.999928613219894e-07, | |
| "loss": 0.1078, | |
| "reward": 1.0625000471870105, | |
| "reward_std": 0.6069262598951658, | |
| "rewards/equation_reward_func": 0.4583333469927311, | |
| "rewards/format_reward_func": 0.6041666877766451, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 273.4652849833171, | |
| "epoch": 0.37097791798107255, | |
| "grad_norm": 3.1586256673049946, | |
| "kl": 0.6038411458333334, | |
| "learning_rate": 4.999906760259783e-07, | |
| "loss": 0.0848, | |
| "reward": 1.1944444874922435, | |
| "reward_std": 0.5770174351831278, | |
| "rewards/equation_reward_func": 0.548611123735706, | |
| "rewards/format_reward_func": 0.6458333482344946, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 279.87500762939453, | |
| "epoch": 0.3747634069400631, | |
| "grad_norm": 2.3306411923794284, | |
| "kl": 0.4184977213541667, | |
| "learning_rate": 4.999881993648632e-07, | |
| "loss": 0.1264, | |
| "reward": 1.1805555820465088, | |
| "reward_std": 0.5819496115048727, | |
| "rewards/equation_reward_func": 0.506944460173448, | |
| "rewards/format_reward_func": 0.6736111318071684, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 285.9930610656738, | |
| "epoch": 0.3785488958990536, | |
| "grad_norm": 2.9295320476594964, | |
| "kl": 0.7996622721354166, | |
| "learning_rate": 4.999854313415308e-07, | |
| "loss": 0.1193, | |
| "reward": 1.1388889302810032, | |
| "reward_std": 0.5301796098550161, | |
| "rewards/equation_reward_func": 0.39583334264655906, | |
| "rewards/format_reward_func": 0.7430555721124014, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 269.12500890096027, | |
| "epoch": 0.38233438485804416, | |
| "grad_norm": 2.6552410076798028, | |
| "kl": 1.19091796875, | |
| "learning_rate": 4.999823719592071e-07, | |
| "loss": 0.216, | |
| "reward": 1.2777778208255768, | |
| "reward_std": 0.5021173569063345, | |
| "rewards/equation_reward_func": 0.506944457689921, | |
| "rewards/format_reward_func": 0.7708333532015482, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 274.5069516499837, | |
| "epoch": 0.3861198738170347, | |
| "grad_norm": 5.1879891100690285, | |
| "kl": 2.9518229166666665, | |
| "learning_rate": 4.999790212214579e-07, | |
| "loss": 0.1756, | |
| "reward": 1.2430555870135624, | |
| "reward_std": 0.58441444983085, | |
| "rewards/equation_reward_func": 0.479166679084301, | |
| "rewards/format_reward_func": 0.7638889054457346, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 248.79167302449545, | |
| "epoch": 0.3899053627760252, | |
| "grad_norm": 3.1959715484572917, | |
| "kl": 1.0735677083333333, | |
| "learning_rate": 4.999753791321885e-07, | |
| "loss": 0.1732, | |
| "reward": 1.3750000496705372, | |
| "reward_std": 0.5170091787974039, | |
| "rewards/equation_reward_func": 0.5902777947485447, | |
| "rewards/format_reward_func": 0.7847222487131754, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 244.10417302449545, | |
| "epoch": 0.3936908517350158, | |
| "grad_norm": 14.582535270082563, | |
| "kl": 6.297200520833333, | |
| "learning_rate": 4.999714456956438e-07, | |
| "loss": 0.0727, | |
| "reward": 1.2986111442248027, | |
| "reward_std": 0.5151846868296465, | |
| "rewards/equation_reward_func": 0.5069444552063942, | |
| "rewards/format_reward_func": 0.7916666815678278, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 242.52778244018555, | |
| "epoch": 0.39747634069400634, | |
| "grad_norm": 151.19489080244557, | |
| "kl": 27.640625, | |
| "learning_rate": 4.99967220916408e-07, | |
| "loss": 0.0915, | |
| "reward": 1.3958333532015483, | |
| "reward_std": 0.48437386751174927, | |
| "rewards/equation_reward_func": 0.5486111175268888, | |
| "rewards/format_reward_func": 0.8472222437461218, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 185.7916717529297, | |
| "epoch": 0.40126182965299684, | |
| "grad_norm": 37.78796510721226, | |
| "kl": 9.738444010416666, | |
| "learning_rate": 4.999627047994053e-07, | |
| "loss": 0.0349, | |
| "reward": 1.4375000596046448, | |
| "reward_std": 0.48517493655284244, | |
| "rewards/equation_reward_func": 0.5763889048248529, | |
| "rewards/format_reward_func": 0.8611111342906952, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 248.83333841959634, | |
| "epoch": 0.4050473186119874, | |
| "grad_norm": 15.557449401743996, | |
| "kl": 1.7869466145833333, | |
| "learning_rate": 4.999578973498994e-07, | |
| "loss": 0.0905, | |
| "reward": 1.2916667064030964, | |
| "reward_std": 0.5043560986717542, | |
| "rewards/equation_reward_func": 0.4652777872979641, | |
| "rewards/format_reward_func": 0.8263889054457346, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 221.31945164998373, | |
| "epoch": 0.4088328075709779, | |
| "grad_norm": 2.746698671118404, | |
| "kl": 2.4767252604166665, | |
| "learning_rate": 4.999527985734931e-07, | |
| "loss": 0.1176, | |
| "reward": 1.3958333681027095, | |
| "reward_std": 0.4606535832087199, | |
| "rewards/equation_reward_func": 0.5486111280818781, | |
| "rewards/format_reward_func": 0.8472222437461218, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 265.2291742960612, | |
| "epoch": 0.41261829652996845, | |
| "grad_norm": 8.47115489110944, | |
| "kl": 2.67626953125, | |
| "learning_rate": 4.999474084761293e-07, | |
| "loss": 0.1801, | |
| "reward": 1.4375000496705372, | |
| "reward_std": 0.4704290193816026, | |
| "rewards/equation_reward_func": 0.5763889029622078, | |
| "rewards/format_reward_func": 0.8611111293236414, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 216.90278498331705, | |
| "epoch": 0.416403785488959, | |
| "grad_norm": 2.6419184728296528, | |
| "kl": 1.8723958333333333, | |
| "learning_rate": 4.999417270640898e-07, | |
| "loss": 0.0151, | |
| "reward": 1.3125000447034836, | |
| "reward_std": 0.5176352287332217, | |
| "rewards/equation_reward_func": 0.4513888967533906, | |
| "rewards/format_reward_func": 0.8611111342906952, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 237.16667556762695, | |
| "epoch": 0.4201892744479495, | |
| "grad_norm": 3.5632903162508525, | |
| "kl": 1.9169108072916667, | |
| "learning_rate": 4.999357543439968e-07, | |
| "loss": 0.2532, | |
| "reward": 1.3263889302810032, | |
| "reward_std": 0.46584198499719304, | |
| "rewards/equation_reward_func": 0.44444444961845875, | |
| "rewards/format_reward_func": 0.8819444676240286, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 250.10417048136392, | |
| "epoch": 0.42397476340694007, | |
| "grad_norm": 2.836432685345919, | |
| "kl": 2.11572265625, | |
| "learning_rate": 4.999294903228113e-07, | |
| "loss": 0.0877, | |
| "reward": 1.3541666964689891, | |
| "reward_std": 0.5378451521197954, | |
| "rewards/equation_reward_func": 0.5347222381581863, | |
| "rewards/format_reward_func": 0.8194444676240286, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 256.95139439900714, | |
| "epoch": 0.4277602523659306, | |
| "grad_norm": 15.018378327595181, | |
| "kl": 8.984842936197916, | |
| "learning_rate": 4.999229350078339e-07, | |
| "loss": 0.116, | |
| "reward": 1.4513889253139496, | |
| "reward_std": 0.4579727239906788, | |
| "rewards/equation_reward_func": 0.562500019868215, | |
| "rewards/format_reward_func": 0.8888889054457346, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 249.65973154703775, | |
| "epoch": 0.43154574132492113, | |
| "grad_norm": 2.9312130360707225, | |
| "kl": 1.6197916666666667, | |
| "learning_rate": 4.99916088406705e-07, | |
| "loss": 0.1031, | |
| "reward": 1.4722222586472828, | |
| "reward_std": 0.491986704369386, | |
| "rewards/equation_reward_func": 0.5763889017204443, | |
| "rewards/format_reward_func": 0.895833358168602, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 237.29167048136392, | |
| "epoch": 0.4353312302839117, | |
| "grad_norm": 318.50068999038837, | |
| "kl": 10.434326171875, | |
| "learning_rate": 4.999089505274044e-07, | |
| "loss": 0.073, | |
| "reward": 1.326388920346896, | |
| "reward_std": 0.42563923199971515, | |
| "rewards/equation_reward_func": 0.4375000149011612, | |
| "rewards/format_reward_func": 0.8888889153798422, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 247.90973027547201, | |
| "epoch": 0.4391167192429022, | |
| "grad_norm": 4.524241932647995, | |
| "kl": 1.73681640625, | |
| "learning_rate": 4.999015213782511e-07, | |
| "loss": 0.0973, | |
| "reward": 1.4375000496705372, | |
| "reward_std": 0.5439534323910872, | |
| "rewards/equation_reward_func": 0.6597222462296486, | |
| "rewards/format_reward_func": 0.7777777959903082, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 241.83334096272787, | |
| "epoch": 0.44290220820189274, | |
| "grad_norm": 2.1566548938944345, | |
| "kl": 6.7578125, | |
| "learning_rate": 4.998938009679042e-07, | |
| "loss": 0.0664, | |
| "reward": 1.4027778307596843, | |
| "reward_std": 0.5103383002181848, | |
| "rewards/equation_reward_func": 0.5972222362955412, | |
| "rewards/format_reward_func": 0.8055555820465088, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 265.61112213134766, | |
| "epoch": 0.4466876971608833, | |
| "grad_norm": 10.590567425193024, | |
| "kl": 1.2277018229166667, | |
| "learning_rate": 4.998857893053613e-07, | |
| "loss": 0.0824, | |
| "reward": 1.4166666964689891, | |
| "reward_std": 0.5057607839504877, | |
| "rewards/equation_reward_func": 0.5555555745959282, | |
| "rewards/format_reward_func": 0.8611111342906952, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 270.2361195882161, | |
| "epoch": 0.4504731861198738, | |
| "grad_norm": 2.807056262560773, | |
| "kl": 2.3780517578125, | |
| "learning_rate": 4.998774863999605e-07, | |
| "loss": 0.1143, | |
| "reward": 1.3888889302810032, | |
| "reward_std": 0.38816434393326443, | |
| "rewards/equation_reward_func": 0.5138889023413261, | |
| "rewards/format_reward_func": 0.8750000149011612, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 282.70834096272785, | |
| "epoch": 0.45425867507886436, | |
| "grad_norm": 3.1612686868215154, | |
| "kl": 1.0327962239583333, | |
| "learning_rate": 4.998688922613787e-07, | |
| "loss": 0.0685, | |
| "reward": 1.4305555919806163, | |
| "reward_std": 0.5549860845009486, | |
| "rewards/equation_reward_func": 0.6041666784634193, | |
| "rewards/format_reward_func": 0.8263889104127884, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 215.09722646077475, | |
| "epoch": 0.4580441640378549, | |
| "grad_norm": 2.621718223845123, | |
| "kl": 6.5284423828125, | |
| "learning_rate": 4.998600068996324e-07, | |
| "loss": 0.099, | |
| "reward": 1.319444477558136, | |
| "reward_std": 0.42932410165667534, | |
| "rewards/equation_reward_func": 0.5000000211099783, | |
| "rewards/format_reward_func": 0.8194444626569748, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 264.2569529215495, | |
| "epoch": 0.4618296529968454, | |
| "grad_norm": 4.352484643043419, | |
| "kl": 0.9919026692708334, | |
| "learning_rate": 4.998508303250775e-07, | |
| "loss": 0.0482, | |
| "reward": 1.48611115415891, | |
| "reward_std": 0.5396140466133753, | |
| "rewards/equation_reward_func": 0.6041666865348816, | |
| "rewards/format_reward_func": 0.8819444527228674, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 243.7986183166504, | |
| "epoch": 0.465615141955836, | |
| "grad_norm": 5.13544206444599, | |
| "kl": 2.6465657552083335, | |
| "learning_rate": 4.998413625484094e-07, | |
| "loss": 0.1093, | |
| "reward": 1.2500000298023224, | |
| "reward_std": 0.4689197850724061, | |
| "rewards/equation_reward_func": 0.43055556155741215, | |
| "rewards/format_reward_func": 0.8194444676240286, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 251.54167302449545, | |
| "epoch": 0.4694006309148265, | |
| "grad_norm": 2.679299182739803, | |
| "kl": 0.9150797526041666, | |
| "learning_rate": 4.998316035806628e-07, | |
| "loss": 0.1428, | |
| "reward": 1.3888889253139496, | |
| "reward_std": 0.47515800098578137, | |
| "rewards/equation_reward_func": 0.5486111318071684, | |
| "rewards/format_reward_func": 0.8402777910232544, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 268.2152837117513, | |
| "epoch": 0.47318611987381703, | |
| "grad_norm": 1.8553455629336253, | |
| "kl": 1.0334065755208333, | |
| "learning_rate": 4.998215534332118e-07, | |
| "loss": 0.0777, | |
| "reward": 1.4861111640930176, | |
| "reward_std": 0.40932964409391087, | |
| "rewards/equation_reward_func": 0.6250000124176344, | |
| "rewards/format_reward_func": 0.8611111293236414, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 267.91667556762695, | |
| "epoch": 0.4769716088328076, | |
| "grad_norm": 2.3104189096021246, | |
| "kl": 1.7921549479166667, | |
| "learning_rate": 4.998112121177698e-07, | |
| "loss": 0.0391, | |
| "reward": 1.3888889253139496, | |
| "reward_std": 0.4704259845117728, | |
| "rewards/equation_reward_func": 0.5138889048248529, | |
| "rewards/format_reward_func": 0.8750000149011612, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 244.9861183166504, | |
| "epoch": 0.4807570977917981, | |
| "grad_norm": 5.244107644705432, | |
| "kl": 3.0703328450520835, | |
| "learning_rate": 4.9980057964639e-07, | |
| "loss": 0.1144, | |
| "reward": 1.3680555870135624, | |
| "reward_std": 0.3652517894903819, | |
| "rewards/equation_reward_func": 0.4444444552063942, | |
| "rewards/format_reward_func": 0.9236111243565878, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 242.57639694213867, | |
| "epoch": 0.48454258675078865, | |
| "grad_norm": 3.0461805649624036, | |
| "kl": 0.55810546875, | |
| "learning_rate": 4.99789656031464e-07, | |
| "loss": 0.1251, | |
| "reward": 1.451388915379842, | |
| "reward_std": 0.43821969131628674, | |
| "rewards/equation_reward_func": 0.5416666828095913, | |
| "rewards/format_reward_func": 0.9097222437461218, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 268.93056360880536, | |
| "epoch": 0.48832807570977915, | |
| "grad_norm": 2.7875837232126814, | |
| "kl": 19.790120442708332, | |
| "learning_rate": 4.997784412857239e-07, | |
| "loss": 0.1328, | |
| "reward": 1.4444445073604584, | |
| "reward_std": 0.481424443423748, | |
| "rewards/equation_reward_func": 0.5625000136593977, | |
| "rewards/format_reward_func": 0.881944457689921, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 243.06944783528647, | |
| "epoch": 0.4921135646687697, | |
| "grad_norm": 3.3379512405703986, | |
| "kl": 2.3311360677083335, | |
| "learning_rate": 4.997669354222401e-07, | |
| "loss": 0.0831, | |
| "reward": 1.4444444874922435, | |
| "reward_std": 0.48848551760117215, | |
| "rewards/equation_reward_func": 0.5555555758376917, | |
| "rewards/format_reward_func": 0.8888889104127884, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 257.15972900390625, | |
| "epoch": 0.49589905362776027, | |
| "grad_norm": 5.920825260861832, | |
| "kl": 2.1470540364583335, | |
| "learning_rate": 4.99755138454423e-07, | |
| "loss": 0.0901, | |
| "reward": 1.4166667014360428, | |
| "reward_std": 0.40707051381468773, | |
| "rewards/equation_reward_func": 0.5208333519597849, | |
| "rewards/format_reward_func": 0.8958333532015482, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 275.7361208597819, | |
| "epoch": 0.49968454258675077, | |
| "grad_norm": 119.84927693026204, | |
| "kl": 16.074625651041668, | |
| "learning_rate": 4.997430503960219e-07, | |
| "loss": 0.1126, | |
| "reward": 1.4236111442248027, | |
| "reward_std": 0.44205466161171597, | |
| "rewards/equation_reward_func": 0.5347222350537777, | |
| "rewards/format_reward_func": 0.8888889104127884, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 283.8333396911621, | |
| "epoch": 0.5034700315457413, | |
| "grad_norm": 2.3654173997862147, | |
| "kl": 1.8214518229166667, | |
| "learning_rate": 4.997306712611255e-07, | |
| "loss": 0.1992, | |
| "reward": 1.4097222487131755, | |
| "reward_std": 0.4522901251912117, | |
| "rewards/equation_reward_func": 0.5416666778425375, | |
| "rewards/format_reward_func": 0.8680555721124014, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 255.3055648803711, | |
| "epoch": 0.5072555205047319, | |
| "grad_norm": 8.674419859591838, | |
| "kl": 1.6661783854166667, | |
| "learning_rate": 4.997180010641617e-07, | |
| "loss": 0.0642, | |
| "reward": 1.4236111640930176, | |
| "reward_std": 0.4788891275723775, | |
| "rewards/equation_reward_func": 0.5277777922650179, | |
| "rewards/format_reward_func": 0.8958333532015482, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 258.4166742960612, | |
| "epoch": 0.5110410094637224, | |
| "grad_norm": 4.561358423608036, | |
| "kl": 1.7342122395833333, | |
| "learning_rate": 4.997050398198976e-07, | |
| "loss": 0.008, | |
| "reward": 1.3125000496705372, | |
| "reward_std": 0.4775065655509631, | |
| "rewards/equation_reward_func": 0.39583334140479565, | |
| "rewards/format_reward_func": 0.9166666815678278, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 295.40278752644855, | |
| "epoch": 0.5148264984227129, | |
| "grad_norm": 3.607344267562202, | |
| "kl": 1.3319905598958333, | |
| "learning_rate": 4.996917875434397e-07, | |
| "loss": 0.0834, | |
| "reward": 1.36111115415891, | |
| "reward_std": 0.4735433558622996, | |
| "rewards/equation_reward_func": 0.46527778419355553, | |
| "rewards/format_reward_func": 0.8958333532015482, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 261.59722900390625, | |
| "epoch": 0.5186119873817034, | |
| "grad_norm": 2.5043328754943537, | |
| "kl": 1.7516276041666667, | |
| "learning_rate": 4.996782442502337e-07, | |
| "loss": 0.1104, | |
| "reward": 1.3750000496705372, | |
| "reward_std": 0.49132541194558144, | |
| "rewards/equation_reward_func": 0.49305557397504646, | |
| "rewards/format_reward_func": 0.8819444527228674, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 286.08334096272785, | |
| "epoch": 0.522397476340694, | |
| "grad_norm": 6.295893654204792, | |
| "kl": 4.499348958333333, | |
| "learning_rate": 4.996644099560641e-07, | |
| "loss": 0.1441, | |
| "reward": 1.4722222685813904, | |
| "reward_std": 0.5132550907631716, | |
| "rewards/equation_reward_func": 0.5833333507180214, | |
| "rewards/format_reward_func": 0.8888889153798422, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 273.93056360880536, | |
| "epoch": 0.5261829652996846, | |
| "grad_norm": 7.804266363603045, | |
| "kl": 1.0720621744791667, | |
| "learning_rate": 4.996502846770549e-07, | |
| "loss": 0.1438, | |
| "reward": 1.3402778059244156, | |
| "reward_std": 0.4476064319411914, | |
| "rewards/equation_reward_func": 0.4305555584530036, | |
| "rewards/format_reward_func": 0.9097222338120142, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 240.97222900390625, | |
| "epoch": 0.5299684542586751, | |
| "grad_norm": 224.30396185994454, | |
| "kl": 22.011637369791668, | |
| "learning_rate": 4.996358684296693e-07, | |
| "loss": 0.1255, | |
| "reward": 1.3680556019147236, | |
| "reward_std": 0.41704921424388885, | |
| "rewards/equation_reward_func": 0.45138889985779923, | |
| "rewards/format_reward_func": 0.9166666815678278, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 245.88889694213867, | |
| "epoch": 0.5337539432176656, | |
| "grad_norm": 2.9359173813915618, | |
| "kl": 4.063395182291667, | |
| "learning_rate": 4.996211612307092e-07, | |
| "loss": 0.1143, | |
| "reward": 1.3333333482344945, | |
| "reward_std": 0.4269623930255572, | |
| "rewards/equation_reward_func": 0.4791666803260644, | |
| "rewards/format_reward_func": 0.8541666865348816, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 288.9722264607747, | |
| "epoch": 0.5375394321766561, | |
| "grad_norm": 2.754515790547288, | |
| "kl": 1.142822265625, | |
| "learning_rate": 4.996061630973162e-07, | |
| "loss": 0.1758, | |
| "reward": 1.4722222636143367, | |
| "reward_std": 0.38450759773453075, | |
| "rewards/equation_reward_func": 0.5763889042039713, | |
| "rewards/format_reward_func": 0.8958333482344946, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 273.65278752644855, | |
| "epoch": 0.5413249211356467, | |
| "grad_norm": 3.6074116845363675, | |
| "kl": 62.014078776041664, | |
| "learning_rate": 4.995908740469706e-07, | |
| "loss": 0.2716, | |
| "reward": 1.3750000447034836, | |
| "reward_std": 0.4357808977365494, | |
| "rewards/equation_reward_func": 0.45833334140479565, | |
| "rewards/format_reward_func": 0.9166666815678278, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 254.61111958821616, | |
| "epoch": 0.5451104100946372, | |
| "grad_norm": 2.833549076551859, | |
| "kl": 0.9168701171875, | |
| "learning_rate": 4.995752940974918e-07, | |
| "loss": 0.1139, | |
| "reward": 1.4652778108914692, | |
| "reward_std": 0.500111423432827, | |
| "rewards/equation_reward_func": 0.5416666865348816, | |
| "rewards/format_reward_func": 0.9236111293236414, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 266.50000890096027, | |
| "epoch": 0.5488958990536278, | |
| "grad_norm": 2.0285819176753637, | |
| "kl": 0.7223714192708334, | |
| "learning_rate": 4.995594232670383e-07, | |
| "loss": 0.0795, | |
| "reward": 1.5000000298023224, | |
| "reward_std": 0.3858482278883457, | |
| "rewards/equation_reward_func": 0.5902777904023727, | |
| "rewards/format_reward_func": 0.909722238779068, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 242.50000953674316, | |
| "epoch": 0.5526813880126183, | |
| "grad_norm": 2.469130613713446, | |
| "kl": 7.028157552083333, | |
| "learning_rate": 4.995432615741076e-07, | |
| "loss": 0.0928, | |
| "reward": 1.5208333730697632, | |
| "reward_std": 0.3851733220120271, | |
| "rewards/equation_reward_func": 0.5972222313284874, | |
| "rewards/format_reward_func": 0.9236111293236414, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 256.7430610656738, | |
| "epoch": 0.5564668769716088, | |
| "grad_norm": 2.818264802652981, | |
| "kl": 0.8765869140625, | |
| "learning_rate": 4.995268090375362e-07, | |
| "loss": 0.134, | |
| "reward": 1.4930555919806163, | |
| "reward_std": 0.4773927927017212, | |
| "rewards/equation_reward_func": 0.6041666840513548, | |
| "rewards/format_reward_func": 0.8888889054457346, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 269.722230275472, | |
| "epoch": 0.5602523659305993, | |
| "grad_norm": 2.6290072054683082, | |
| "kl": 1.646728515625, | |
| "learning_rate": 4.995100656764996e-07, | |
| "loss": 0.111, | |
| "reward": 1.3402778108914692, | |
| "reward_std": 0.45711999386548996, | |
| "rewards/equation_reward_func": 0.430555568387111, | |
| "rewards/format_reward_func": 0.9097222437461218, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 283.94445419311523, | |
| "epoch": 0.5640378548895899, | |
| "grad_norm": 3.52286689241144, | |
| "kl": 1.5117594401041667, | |
| "learning_rate": 4.994930315105124e-07, | |
| "loss": 0.1291, | |
| "reward": 1.4722222586472828, | |
| "reward_std": 0.4221850348015626, | |
| "rewards/equation_reward_func": 0.5763889029622078, | |
| "rewards/format_reward_func": 0.8958333532015482, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 260.83334096272785, | |
| "epoch": 0.5678233438485805, | |
| "grad_norm": 3.048640806478669, | |
| "kl": 8.982340494791666, | |
| "learning_rate": 4.994757065594279e-07, | |
| "loss": 0.1167, | |
| "reward": 1.4236111442248027, | |
| "reward_std": 0.4365849755704403, | |
| "rewards/equation_reward_func": 0.534722238779068, | |
| "rewards/format_reward_func": 0.8888889104127884, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 251.4861208597819, | |
| "epoch": 0.571608832807571, | |
| "grad_norm": 5.880882126873241, | |
| "kl": 2.01953125, | |
| "learning_rate": 4.994580908434383e-07, | |
| "loss": 0.2153, | |
| "reward": 1.3750000298023224, | |
| "reward_std": 0.4684516203900178, | |
| "rewards/equation_reward_func": 0.47916667846341926, | |
| "rewards/format_reward_func": 0.8958333482344946, | |
| "step": 302 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 308.96528244018555, | |
| "epoch": 0.5753943217665615, | |
| "grad_norm": 7.940563386747667, | |
| "kl": 2.2464192708333335, | |
| "learning_rate": 4.994401843830749e-07, | |
| "loss": 0.2154, | |
| "reward": 1.2638889352480571, | |
| "reward_std": 0.516243410607179, | |
| "rewards/equation_reward_func": 0.41666668343047303, | |
| "rewards/format_reward_func": 0.8472222437461218, | |
| "step": 304 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 232.1180623372396, | |
| "epoch": 0.579179810725552, | |
| "grad_norm": 305.4523441721458, | |
| "kl": 29.108561197916668, | |
| "learning_rate": 4.994219871992076e-07, | |
| "loss": 0.2207, | |
| "reward": 1.4375000298023224, | |
| "reward_std": 0.45513641958435375, | |
| "rewards/equation_reward_func": 0.5277778009573618, | |
| "rewards/format_reward_func": 0.909722238779068, | |
| "step": 306 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 264.7777862548828, | |
| "epoch": 0.5829652996845426, | |
| "grad_norm": 2.0715581627005784, | |
| "kl": 1.160400390625, | |
| "learning_rate": 4.994034993130455e-07, | |
| "loss": 0.1089, | |
| "reward": 1.3958333730697632, | |
| "reward_std": 0.3590660902361075, | |
| "rewards/equation_reward_func": 0.47222223443289596, | |
| "rewards/format_reward_func": 0.9236111243565878, | |
| "step": 308 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 257.13195419311523, | |
| "epoch": 0.5867507886435331, | |
| "grad_norm": 3.5395487394835476, | |
| "kl": 1.2493489583333333, | |
| "learning_rate": 4.993847207461362e-07, | |
| "loss": 0.1119, | |
| "reward": 1.3194444924592972, | |
| "reward_std": 0.40260318542520207, | |
| "rewards/equation_reward_func": 0.4236111293236415, | |
| "rewards/format_reward_func": 0.8958333532015482, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 248.52778498331705, | |
| "epoch": 0.5905362776025237, | |
| "grad_norm": 3.948194000938186, | |
| "kl": 1.1299641927083333, | |
| "learning_rate": 4.993656515203662e-07, | |
| "loss": 0.1778, | |
| "reward": 1.3819444874922435, | |
| "reward_std": 0.39707954103748005, | |
| "rewards/equation_reward_func": 0.465277789781491, | |
| "rewards/format_reward_func": 0.9166666766007742, | |
| "step": 312 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 251.06250508626303, | |
| "epoch": 0.5943217665615141, | |
| "grad_norm": 3.3126376703610187, | |
| "kl": 1.5913899739583333, | |
| "learning_rate": 4.993462916579606e-07, | |
| "loss": 0.1415, | |
| "reward": 1.4027778059244156, | |
| "reward_std": 0.415769978115956, | |
| "rewards/equation_reward_func": 0.5069444589316845, | |
| "rewards/format_reward_func": 0.8958333532015482, | |
| "step": 314 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 267.7361195882161, | |
| "epoch": 0.5981072555205047, | |
| "grad_norm": 3.6712572603171045, | |
| "kl": 1.0328776041666667, | |
| "learning_rate": 4.993266411814837e-07, | |
| "loss": 0.1356, | |
| "reward": 1.5138889253139496, | |
| "reward_std": 0.43073243647813797, | |
| "rewards/equation_reward_func": 0.6180555745959282, | |
| "rewards/format_reward_func": 0.8958333482344946, | |
| "step": 316 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 260.7569516499837, | |
| "epoch": 0.6018927444794953, | |
| "grad_norm": 2.898300493316585, | |
| "kl": 1.694091796875, | |
| "learning_rate": 4.993067001138379e-07, | |
| "loss": 0.1933, | |
| "reward": 1.3958333830038707, | |
| "reward_std": 0.45616808036963147, | |
| "rewards/equation_reward_func": 0.5138889054457346, | |
| "rewards/format_reward_func": 0.881944457689921, | |
| "step": 318 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 243.38889439900717, | |
| "epoch": 0.6056782334384858, | |
| "grad_norm": 2.3012954584762206, | |
| "kl": 1.1136881510416667, | |
| "learning_rate": 4.992864684782648e-07, | |
| "loss": 0.0314, | |
| "reward": 1.423611159125964, | |
| "reward_std": 0.4477810760339101, | |
| "rewards/equation_reward_func": 0.4930555696288745, | |
| "rewards/format_reward_func": 0.9305555721124014, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 252.68750635782877, | |
| "epoch": 0.6094637223974764, | |
| "grad_norm": 10.558231881280353, | |
| "kl": 7.27197265625, | |
| "learning_rate": 4.992659462983445e-07, | |
| "loss": 0.1837, | |
| "reward": 1.4444444874922435, | |
| "reward_std": 0.4468059837818146, | |
| "rewards/equation_reward_func": 0.5416666797051827, | |
| "rewards/format_reward_func": 0.9027777959903082, | |
| "step": 322 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 223.90278244018555, | |
| "epoch": 0.6132492113564669, | |
| "grad_norm": 8.24083470994998, | |
| "kl": 1.4090983072916667, | |
| "learning_rate": 4.992451335979955e-07, | |
| "loss": 0.0984, | |
| "reward": 1.4513889253139496, | |
| "reward_std": 0.34703291207551956, | |
| "rewards/equation_reward_func": 0.5000000136593977, | |
| "rewards/format_reward_func": 0.951388900478681, | |
| "step": 324 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 246.9583396911621, | |
| "epoch": 0.6170347003154574, | |
| "grad_norm": 3.165272632330998, | |
| "kl": 1.4227701822916667, | |
| "learning_rate": 4.992240304014751e-07, | |
| "loss": 0.0434, | |
| "reward": 1.381944477558136, | |
| "reward_std": 0.3748237465818723, | |
| "rewards/equation_reward_func": 0.4583333457509677, | |
| "rewards/format_reward_func": 0.9236111243565878, | |
| "step": 326 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 248.64584477742514, | |
| "epoch": 0.6208201892744479, | |
| "grad_norm": 3.802073252683938, | |
| "kl": 1.0417887369791667, | |
| "learning_rate": 4.992026367333793e-07, | |
| "loss": 0.0662, | |
| "reward": 1.5347222487131755, | |
| "reward_std": 0.3855091730753581, | |
| "rewards/equation_reward_func": 0.6111111243565878, | |
| "rewards/format_reward_func": 0.9236111243565878, | |
| "step": 328 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 251.90278498331705, | |
| "epoch": 0.6246056782334385, | |
| "grad_norm": 3.1898348924774695, | |
| "kl": 1.318359375, | |
| "learning_rate": 4.991809526186423e-07, | |
| "loss": 0.1018, | |
| "reward": 1.4930555919806163, | |
| "reward_std": 0.4848398119211197, | |
| "rewards/equation_reward_func": 0.5694444676240286, | |
| "rewards/format_reward_func": 0.9236111293236414, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 270.7916781107585, | |
| "epoch": 0.628391167192429, | |
| "grad_norm": 48.65745253251759, | |
| "kl": 9.658447265625, | |
| "learning_rate": 4.991589780825373e-07, | |
| "loss": 0.2243, | |
| "reward": 1.6180556019147236, | |
| "reward_std": 0.38904641941189766, | |
| "rewards/equation_reward_func": 0.722222238779068, | |
| "rewards/format_reward_func": 0.8958333532015482, | |
| "step": 332 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 247.05556360880533, | |
| "epoch": 0.6321766561514196, | |
| "grad_norm": 12.85876415564074, | |
| "kl": 2.3059895833333335, | |
| "learning_rate": 4.991367131506753e-07, | |
| "loss": 0.0952, | |
| "reward": 1.4930555919806163, | |
| "reward_std": 0.44105598827203113, | |
| "rewards/equation_reward_func": 0.5902777935067812, | |
| "rewards/format_reward_func": 0.9027777959903082, | |
| "step": 334 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 252.79861958821616, | |
| "epoch": 0.63596214511041, | |
| "grad_norm": 3.766932765553029, | |
| "kl": 1.00732421875, | |
| "learning_rate": 4.991141578490066e-07, | |
| "loss": 0.108, | |
| "reward": 1.4305556019147236, | |
| "reward_std": 0.4160829931497574, | |
| "rewards/equation_reward_func": 0.5138888967533907, | |
| "rewards/format_reward_func": 0.9166666766007742, | |
| "step": 336 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 257.6805610656738, | |
| "epoch": 0.6397476340694006, | |
| "grad_norm": 8.363557603327017, | |
| "kl": 2.90673828125, | |
| "learning_rate": 4.990913122038193e-07, | |
| "loss": 0.0988, | |
| "reward": 1.506944477558136, | |
| "reward_std": 0.4711163180569808, | |
| "rewards/equation_reward_func": 0.5833333494762579, | |
| "rewards/format_reward_func": 0.9236111293236414, | |
| "step": 338 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 278.05556360880536, | |
| "epoch": 0.6435331230283912, | |
| "grad_norm": 2.4831862429823874, | |
| "kl": 1.1470540364583333, | |
| "learning_rate": 4.9906817624174e-07, | |
| "loss": 0.1149, | |
| "reward": 1.4583333780368168, | |
| "reward_std": 0.40201255182425183, | |
| "rewards/equation_reward_func": 0.5486111318071684, | |
| "rewards/format_reward_func": 0.9097222338120142, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 270.8125114440918, | |
| "epoch": 0.6473186119873817, | |
| "grad_norm": 96.69755111218885, | |
| "kl": 18.217529296875, | |
| "learning_rate": 4.990447499897339e-07, | |
| "loss": 0.1482, | |
| "reward": 1.4166666964689891, | |
| "reward_std": 0.4657805400590102, | |
| "rewards/equation_reward_func": 0.500000019868215, | |
| "rewards/format_reward_func": 0.9166666865348816, | |
| "step": 342 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 280.9513982137044, | |
| "epoch": 0.6511041009463723, | |
| "grad_norm": 4.4626269454999035, | |
| "kl": 1.0166829427083333, | |
| "learning_rate": 4.990210334751042e-07, | |
| "loss": 0.2191, | |
| "reward": 1.4305555919806163, | |
| "reward_std": 0.5064363280932108, | |
| "rewards/equation_reward_func": 0.5208333445092043, | |
| "rewards/format_reward_func": 0.909722238779068, | |
| "step": 344 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 290.0277862548828, | |
| "epoch": 0.6548895899053627, | |
| "grad_norm": 42.001704471875875, | |
| "kl": 7.866048177083333, | |
| "learning_rate": 4.989970267254928e-07, | |
| "loss": 0.3399, | |
| "reward": 1.37500003973643, | |
| "reward_std": 0.4621751358111699, | |
| "rewards/equation_reward_func": 0.5138889104127884, | |
| "rewards/format_reward_func": 0.8611111243565878, | |
| "step": 346 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 287.0277849833171, | |
| "epoch": 0.6586750788643533, | |
| "grad_norm": 401.0064206569611, | |
| "kl": 13.825358072916666, | |
| "learning_rate": 4.989727297688796e-07, | |
| "loss": 0.2614, | |
| "reward": 1.4930555919806163, | |
| "reward_std": 0.48149604598681134, | |
| "rewards/equation_reward_func": 0.6319444614152113, | |
| "rewards/format_reward_func": 0.8611111342906952, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 267.2222277323405, | |
| "epoch": 0.6624605678233438, | |
| "grad_norm": 5.58193017827173, | |
| "kl": 1.5638020833333333, | |
| "learning_rate": 4.989481426335828e-07, | |
| "loss": 0.2184, | |
| "reward": 1.4791667064030964, | |
| "reward_std": 0.32900576541821164, | |
| "rewards/equation_reward_func": 0.583333345130086, | |
| "rewards/format_reward_func": 0.8958333482344946, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 312.2222315470378, | |
| "epoch": 0.6662460567823344, | |
| "grad_norm": 2.903611804665768, | |
| "kl": 1.7395833333333333, | |
| "learning_rate": 4.989232653482587e-07, | |
| "loss": 0.2021, | |
| "reward": 1.4305555919806163, | |
| "reward_std": 0.4162732983628909, | |
| "rewards/equation_reward_func": 0.5486111268401146, | |
| "rewards/format_reward_func": 0.8819444676240286, | |
| "step": 352 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 295.96528752644855, | |
| "epoch": 0.670031545741325, | |
| "grad_norm": 8.614948807031883, | |
| "kl": 1.4444986979166667, | |
| "learning_rate": 4.98898097941902e-07, | |
| "loss": 0.2504, | |
| "reward": 1.3194444825251896, | |
| "reward_std": 0.3698546774685383, | |
| "rewards/equation_reward_func": 0.43750001179675263, | |
| "rewards/format_reward_func": 0.881944457689921, | |
| "step": 354 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 296.6111208597819, | |
| "epoch": 0.6738170347003155, | |
| "grad_norm": 18.17568858303832, | |
| "kl": 4.43408203125, | |
| "learning_rate": 4.988726404438453e-07, | |
| "loss": 0.2654, | |
| "reward": 1.2569444924592972, | |
| "reward_std": 0.5792658850550652, | |
| "rewards/equation_reward_func": 0.43750001303851604, | |
| "rewards/format_reward_func": 0.8194444626569748, | |
| "step": 356 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 297.1527862548828, | |
| "epoch": 0.677602523659306, | |
| "grad_norm": 3.3997948870685444, | |
| "kl": 2.4781901041666665, | |
| "learning_rate": 4.988468928837595e-07, | |
| "loss": 0.2077, | |
| "reward": 1.4027778307596843, | |
| "reward_std": 0.43186015884081524, | |
| "rewards/equation_reward_func": 0.5625000142802795, | |
| "rewards/format_reward_func": 0.8402777959903082, | |
| "step": 358 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 259.7013969421387, | |
| "epoch": 0.6813880126182965, | |
| "grad_norm": 3.261696651794849, | |
| "kl": 2.1082763671875, | |
| "learning_rate": 4.988208552916535e-07, | |
| "loss": 0.1781, | |
| "reward": 1.388888920346896, | |
| "reward_std": 0.4762779163817565, | |
| "rewards/equation_reward_func": 0.5069444607943296, | |
| "rewards/format_reward_func": 0.881944457689921, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 285.75695673624676, | |
| "epoch": 0.6851735015772871, | |
| "grad_norm": 436.9538386873056, | |
| "kl": 90.0078125, | |
| "learning_rate": 4.987945276978741e-07, | |
| "loss": 0.6442, | |
| "reward": 1.2361111342906952, | |
| "reward_std": 0.47308399528265, | |
| "rewards/equation_reward_func": 0.3888889054457347, | |
| "rewards/format_reward_func": 0.8472222437461218, | |
| "step": 362 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 253.26389821370444, | |
| "epoch": 0.6889589905362776, | |
| "grad_norm": 6.275698981154313, | |
| "kl": 1.6064453125, | |
| "learning_rate": 4.987679101331063e-07, | |
| "loss": 0.2335, | |
| "reward": 1.4861111442248027, | |
| "reward_std": 0.4897613674402237, | |
| "rewards/equation_reward_func": 0.6041666890184084, | |
| "rewards/format_reward_func": 0.8819444626569748, | |
| "step": 364 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 225.61111958821616, | |
| "epoch": 0.6927444794952682, | |
| "grad_norm": 5.577871979120166, | |
| "kl": 0.7556966145833334, | |
| "learning_rate": 4.987410026283729e-07, | |
| "loss": 0.1068, | |
| "reward": 1.48611115415891, | |
| "reward_std": 0.5080769136548042, | |
| "rewards/equation_reward_func": 0.5763889054457346, | |
| "rewards/format_reward_func": 0.9097222437461218, | |
| "step": 366 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 208.28472900390625, | |
| "epoch": 0.6965299684542586, | |
| "grad_norm": 34.81645021530138, | |
| "kl": 5.219563802083333, | |
| "learning_rate": 4.98713805215035e-07, | |
| "loss": 0.1549, | |
| "reward": 1.4583333830038707, | |
| "reward_std": 0.40722255781292915, | |
| "rewards/equation_reward_func": 0.5625000142802795, | |
| "rewards/format_reward_func": 0.8958333532015482, | |
| "step": 368 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 267.7986183166504, | |
| "epoch": 0.7003154574132492, | |
| "grad_norm": 10.388065090744742, | |
| "kl": 10.697916666666666, | |
| "learning_rate": 4.986863179247908e-07, | |
| "loss": 0.1906, | |
| "reward": 1.3750000447034836, | |
| "reward_std": 0.47181837012370426, | |
| "rewards/equation_reward_func": 0.5069444558272759, | |
| "rewards/format_reward_func": 0.8680555721124014, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 229.8611157735189, | |
| "epoch": 0.7041009463722397, | |
| "grad_norm": 4.562876059825846, | |
| "kl": 4.035807291666667, | |
| "learning_rate": 4.986585407896771e-07, | |
| "loss": 0.223, | |
| "reward": 1.4027778208255768, | |
| "reward_std": 0.5173191850384077, | |
| "rewards/equation_reward_func": 0.5486111293236414, | |
| "rewards/format_reward_func": 0.854166696468989, | |
| "step": 372 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 229.1666742960612, | |
| "epoch": 0.7078864353312303, | |
| "grad_norm": 6.466655997110351, | |
| "kl": 758.0651041666666, | |
| "learning_rate": 4.986304738420683e-07, | |
| "loss": 0.4869, | |
| "reward": 1.4305555820465088, | |
| "reward_std": 0.4751903774837653, | |
| "rewards/equation_reward_func": 0.5763888955116272, | |
| "rewards/format_reward_func": 0.8541666865348816, | |
| "step": 374 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 251.9236208597819, | |
| "epoch": 0.7116719242902209, | |
| "grad_norm": 49.29790482270018, | |
| "kl": 13.262369791666666, | |
| "learning_rate": 4.986021171146764e-07, | |
| "loss": 0.3513, | |
| "reward": 1.354166716337204, | |
| "reward_std": 0.5414688164989153, | |
| "rewards/equation_reward_func": 0.5138889054457346, | |
| "rewards/format_reward_func": 0.8402777959903082, | |
| "step": 376 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 254.1666774749756, | |
| "epoch": 0.7154574132492113, | |
| "grad_norm": 5.643615815413666, | |
| "kl": 7.41162109375, | |
| "learning_rate": 4.985734706405516e-07, | |
| "loss": 0.2591, | |
| "reward": 1.2777778059244156, | |
| "reward_std": 0.4625398740172386, | |
| "rewards/equation_reward_func": 0.4513889004786809, | |
| "rewards/format_reward_func": 0.8263889203468958, | |
| "step": 378 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 289.9583396911621, | |
| "epoch": 0.7192429022082019, | |
| "grad_norm": 304.8418060986503, | |
| "kl": 665.8196614583334, | |
| "learning_rate": 4.98544534453081e-07, | |
| "loss": 1.0021, | |
| "reward": 1.2708333830038707, | |
| "reward_std": 0.4970496619741122, | |
| "rewards/equation_reward_func": 0.534722234432896, | |
| "rewards/format_reward_func": 0.736111139257749, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 247.37500635782877, | |
| "epoch": 0.7230283911671924, | |
| "grad_norm": 11.586701386430356, | |
| "kl": 8.091145833333334, | |
| "learning_rate": 4.985153085859902e-07, | |
| "loss": 0.2491, | |
| "reward": 1.43750003973643, | |
| "reward_std": 0.5147989491621653, | |
| "rewards/equation_reward_func": 0.6458333432674408, | |
| "rewards/format_reward_func": 0.7916666865348816, | |
| "step": 382 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 264.2847315470378, | |
| "epoch": 0.726813880126183, | |
| "grad_norm": 9.752593632001947, | |
| "kl": 11.559244791666666, | |
| "learning_rate": 4.984857930733419e-07, | |
| "loss": 0.3493, | |
| "reward": 1.1111111392577488, | |
| "reward_std": 0.47952866181731224, | |
| "rewards/equation_reward_func": 0.347222230086724, | |
| "rewards/format_reward_func": 0.7638889104127884, | |
| "step": 384 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 269.0902849833171, | |
| "epoch": 0.7305993690851735, | |
| "grad_norm": 9.316145758908815, | |
| "kl": 11.126953125, | |
| "learning_rate": 4.984559879495366e-07, | |
| "loss": 0.3237, | |
| "reward": 1.201388920346896, | |
| "reward_std": 0.6368941242496172, | |
| "rewards/equation_reward_func": 0.4861111231148243, | |
| "rewards/format_reward_func": 0.7152778009573618, | |
| "step": 386 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 255.90972900390625, | |
| "epoch": 0.7343848580441641, | |
| "grad_norm": 8.827779574351993, | |
| "kl": 121.453125, | |
| "learning_rate": 4.984258932493123e-07, | |
| "loss": 0.5189, | |
| "reward": 1.2638889352480571, | |
| "reward_std": 0.5239984119931856, | |
| "rewards/equation_reward_func": 0.5555555671453476, | |
| "rewards/format_reward_func": 0.708333358168602, | |
| "step": 388 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 237.27084159851074, | |
| "epoch": 0.7381703470031545, | |
| "grad_norm": 8.013012272149158, | |
| "kl": 24.984375, | |
| "learning_rate": 4.983955090077444e-07, | |
| "loss": 0.2832, | |
| "reward": 1.1597222437461217, | |
| "reward_std": 0.5535530770818392, | |
| "rewards/equation_reward_func": 0.48611112497746944, | |
| "rewards/format_reward_func": 0.6736111268401146, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 249.63889821370444, | |
| "epoch": 0.7419558359621451, | |
| "grad_norm": 438.51211315014166, | |
| "kl": 126.08072916666667, | |
| "learning_rate": 4.983648352602459e-07, | |
| "loss": 0.3395, | |
| "reward": 1.1250000298023224, | |
| "reward_std": 0.6015344088276228, | |
| "rewards/equation_reward_func": 0.4930555646618207, | |
| "rewards/format_reward_func": 0.6319444750746092, | |
| "step": 392 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 195.81250508626303, | |
| "epoch": 0.7457413249211357, | |
| "grad_norm": 13.870204564822584, | |
| "kl": 9.074869791666666, | |
| "learning_rate": 4.983338720425672e-07, | |
| "loss": 0.2873, | |
| "reward": 1.1805555770794551, | |
| "reward_std": 0.6060735906163851, | |
| "rewards/equation_reward_func": 0.4583333407839139, | |
| "rewards/format_reward_func": 0.7222222437461218, | |
| "step": 394 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 254.87500699361166, | |
| "epoch": 0.7495268138801262, | |
| "grad_norm": 82.14394465970908, | |
| "kl": 38.481770833333336, | |
| "learning_rate": 4.98302619390796e-07, | |
| "loss": 0.3067, | |
| "reward": 1.1250000298023224, | |
| "reward_std": 0.4945492781698704, | |
| "rewards/equation_reward_func": 0.5277777904023727, | |
| "rewards/format_reward_func": 0.5972222425043583, | |
| "step": 396 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 221.51389439900717, | |
| "epoch": 0.7533123028391168, | |
| "grad_norm": 17.556835883262877, | |
| "kl": 97.25, | |
| "learning_rate": 4.982710773413576e-07, | |
| "loss": 0.3719, | |
| "reward": 1.131944477558136, | |
| "reward_std": 0.588702150930961, | |
| "rewards/equation_reward_func": 0.5763889079292616, | |
| "rewards/format_reward_func": 0.555555577079455, | |
| "step": 398 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 197.82639376322427, | |
| "epoch": 0.7570977917981072, | |
| "grad_norm": 26.30378944955965, | |
| "kl": 17.8984375, | |
| "learning_rate": 4.98239245931014e-07, | |
| "loss": 0.3139, | |
| "reward": 1.1805555870135624, | |
| "reward_std": 0.5916161189476649, | |
| "rewards/equation_reward_func": 0.5902777959903082, | |
| "rewards/format_reward_func": 0.5902777959903082, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 222.54861704508463, | |
| "epoch": 0.7608832807570978, | |
| "grad_norm": 10.696318069471166, | |
| "kl": 14.2109375, | |
| "learning_rate": 4.982071251968652e-07, | |
| "loss": 0.2388, | |
| "reward": 1.1041666964689891, | |
| "reward_std": 0.5821270644664764, | |
| "rewards/equation_reward_func": 0.5069444638987383, | |
| "rewards/format_reward_func": 0.5972222437461218, | |
| "step": 402 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 219.2916742960612, | |
| "epoch": 0.7646687697160883, | |
| "grad_norm": 132.22587525968703, | |
| "kl": 40.453125, | |
| "learning_rate": 4.981747151763478e-07, | |
| "loss": 0.2509, | |
| "reward": 1.0208333631356556, | |
| "reward_std": 0.6254869078596433, | |
| "rewards/equation_reward_func": 0.493055568387111, | |
| "rewards/format_reward_func": 0.5277777959903082, | |
| "step": 404 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 213.25000445048013, | |
| "epoch": 0.7684542586750789, | |
| "grad_norm": 53.75808201656059, | |
| "kl": 28.166666666666668, | |
| "learning_rate": 4.981420159072359e-07, | |
| "loss": 0.3216, | |
| "reward": 0.923611139257749, | |
| "reward_std": 0.5980016005535921, | |
| "rewards/equation_reward_func": 0.39583334513008595, | |
| "rewards/format_reward_func": 0.5277777860562006, | |
| "step": 406 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 255.5486208597819, | |
| "epoch": 0.7722397476340694, | |
| "grad_norm": 76.3126195344439, | |
| "kl": 24.140625, | |
| "learning_rate": 4.981090274276405e-07, | |
| "loss": 0.2661, | |
| "reward": 1.0833333681027095, | |
| "reward_std": 0.6427489096919695, | |
| "rewards/equation_reward_func": 0.5833333482344946, | |
| "rewards/format_reward_func": 0.500000019868215, | |
| "step": 408 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 223.21528244018555, | |
| "epoch": 0.7760252365930599, | |
| "grad_norm": 13589.418456534844, | |
| "kl": 1149.7135416666667, | |
| "learning_rate": 4.9807574977601e-07, | |
| "loss": 2.3024, | |
| "reward": 0.9375000447034836, | |
| "reward_std": 0.6095106812814871, | |
| "rewards/equation_reward_func": 0.42361112497746944, | |
| "rewards/format_reward_func": 0.5138889029622078, | |
| "step": 410 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 227.54167366027832, | |
| "epoch": 0.7798107255520504, | |
| "grad_norm": 33.78184675982937, | |
| "kl": 29.5703125, | |
| "learning_rate": 4.980421829911295e-07, | |
| "loss": 0.269, | |
| "reward": 0.8541667014360428, | |
| "reward_std": 0.6479750176270803, | |
| "rewards/equation_reward_func": 0.43055556900799274, | |
| "rewards/format_reward_func": 0.4236111268401146, | |
| "step": 412 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 274.4166742960612, | |
| "epoch": 0.783596214511041, | |
| "grad_norm": 17.81633266386669, | |
| "kl": 28.666666666666668, | |
| "learning_rate": 4.980083271121214e-07, | |
| "loss": 0.3345, | |
| "reward": 0.909722238779068, | |
| "reward_std": 0.6108483547965685, | |
| "rewards/equation_reward_func": 0.5277777959903082, | |
| "rewards/format_reward_func": 0.3819444576899211, | |
| "step": 414 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 251.74306106567383, | |
| "epoch": 0.7873817034700316, | |
| "grad_norm": 22.608432736994907, | |
| "kl": 51.177083333333336, | |
| "learning_rate": 4.979741821784445e-07, | |
| "loss": 0.2628, | |
| "reward": 0.8680555870135626, | |
| "reward_std": 0.6757829288641611, | |
| "rewards/equation_reward_func": 0.4583333457509677, | |
| "rewards/format_reward_func": 0.40972222822407883, | |
| "step": 416 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 265.33334096272785, | |
| "epoch": 0.7911671924290221, | |
| "grad_norm": 13.10404823539201, | |
| "kl": 27.53125, | |
| "learning_rate": 4.979397482298952e-07, | |
| "loss": 0.3222, | |
| "reward": 0.7916666939854622, | |
| "reward_std": 0.619778610765934, | |
| "rewards/equation_reward_func": 0.38194445582727593, | |
| "rewards/format_reward_func": 0.4097222313284874, | |
| "step": 418 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 245.4513931274414, | |
| "epoch": 0.7949526813880127, | |
| "grad_norm": 17.285612572481327, | |
| "kl": 25.333333333333332, | |
| "learning_rate": 4.979050253066063e-07, | |
| "loss": 0.2375, | |
| "reward": 0.937500019868215, | |
| "reward_std": 0.5681246320406595, | |
| "rewards/equation_reward_func": 0.5000000136593977, | |
| "rewards/format_reward_func": 0.4375000074505806, | |
| "step": 420 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 247.6736183166504, | |
| "epoch": 0.7987381703470031, | |
| "grad_norm": 114.10864728746037, | |
| "kl": 68.94791666666667, | |
| "learning_rate": 4.978700134490473e-07, | |
| "loss": 0.3221, | |
| "reward": 0.9861111293236414, | |
| "reward_std": 0.6230639989177386, | |
| "rewards/equation_reward_func": 0.4791666778425376, | |
| "rewards/format_reward_func": 0.5069444589316845, | |
| "step": 422 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 268.19445037841797, | |
| "epoch": 0.8025236593059937, | |
| "grad_norm": 27.278356050728746, | |
| "kl": 63.755208333333336, | |
| "learning_rate": 4.97834712698025e-07, | |
| "loss": 0.3404, | |
| "reward": 0.9027778077870607, | |
| "reward_std": 0.6374689054985841, | |
| "rewards/equation_reward_func": 0.5208333469927311, | |
| "rewards/format_reward_func": 0.3819444514811039, | |
| "step": 424 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 261.0486183166504, | |
| "epoch": 0.8063091482649842, | |
| "grad_norm": 112.26698272573795, | |
| "kl": 125.875, | |
| "learning_rate": 4.977991230946823e-07, | |
| "loss": 0.3086, | |
| "reward": 0.9791666915019354, | |
| "reward_std": 0.6475708857178688, | |
| "rewards/equation_reward_func": 0.5763889091710249, | |
| "rewards/format_reward_func": 0.40277778916060925, | |
| "step": 426 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 304.7708435058594, | |
| "epoch": 0.8100946372239748, | |
| "grad_norm": 88.42317906709971, | |
| "kl": 145.625, | |
| "learning_rate": 4.977632446804992e-07, | |
| "loss": 0.3789, | |
| "reward": 0.784722238779068, | |
| "reward_std": 0.6482410331567129, | |
| "rewards/equation_reward_func": 0.451388909171025, | |
| "rewards/format_reward_func": 0.33333334388832253, | |
| "step": 428 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 276.3194529215495, | |
| "epoch": 0.8138801261829653, | |
| "grad_norm": 57.008405478973984, | |
| "kl": 90.27083333333333, | |
| "learning_rate": 4.97727077497292e-07, | |
| "loss": 0.3829, | |
| "reward": 0.8888889054457346, | |
| "reward_std": 0.58370058486859, | |
| "rewards/equation_reward_func": 0.5763889079292616, | |
| "rewards/format_reward_func": 0.3125000074505806, | |
| "step": 430 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 274.0277837117513, | |
| "epoch": 0.8176656151419558, | |
| "grad_norm": 41.9648702558559, | |
| "kl": 93.64973958333333, | |
| "learning_rate": 4.976906215872137e-07, | |
| "loss": 0.2295, | |
| "reward": 0.8263889054457346, | |
| "reward_std": 0.6093253418803215, | |
| "rewards/equation_reward_func": 0.4861111243565877, | |
| "rewards/format_reward_func": 0.34027778419355553, | |
| "step": 432 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 366.2569580078125, | |
| "epoch": 0.8214511041009463, | |
| "grad_norm": 23.70562238362353, | |
| "kl": 49.619791666666664, | |
| "learning_rate": 4.976538769927538e-07, | |
| "loss": 0.2481, | |
| "reward": 0.5763888992369175, | |
| "reward_std": 0.6349846472342809, | |
| "rewards/equation_reward_func": 0.3541666728754838, | |
| "rewards/format_reward_func": 0.2222222244987885, | |
| "step": 434 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 322.51389439900714, | |
| "epoch": 0.8252365930599369, | |
| "grad_norm": 173.94515224795077, | |
| "kl": 55.354166666666664, | |
| "learning_rate": 4.976168437567384e-07, | |
| "loss": 0.2866, | |
| "reward": 0.7361111330489317, | |
| "reward_std": 0.5293329904476801, | |
| "rewards/equation_reward_func": 0.4652777922650178, | |
| "rewards/format_reward_func": 0.2708333395421505, | |
| "step": 436 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 370.1041781107585, | |
| "epoch": 0.8290220820189275, | |
| "grad_norm": 49.42785489603644, | |
| "kl": 38.208333333333336, | |
| "learning_rate": 4.975795219223298e-07, | |
| "loss": 0.2725, | |
| "reward": 0.6250000161429247, | |
| "reward_std": 0.6038348153233528, | |
| "rewards/equation_reward_func": 0.39583334513008595, | |
| "rewards/format_reward_func": 0.2291666710128387, | |
| "step": 438 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 358.9305674235026, | |
| "epoch": 0.832807570977918, | |
| "grad_norm": 63.93647805172191, | |
| "kl": 38.565104166666664, | |
| "learning_rate": 4.975419115330267e-07, | |
| "loss": 0.2397, | |
| "reward": 0.6388889079292616, | |
| "reward_std": 0.5783760311702887, | |
| "rewards/equation_reward_func": 0.40972224312524, | |
| "rewards/format_reward_func": 0.2291666722546021, | |
| "step": 440 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 343.5486208597819, | |
| "epoch": 0.8365930599369085, | |
| "grad_norm": 18.35613118498554, | |
| "kl": 40.0, | |
| "learning_rate": 4.975040126326641e-07, | |
| "loss": 0.3459, | |
| "reward": 0.7291666766007742, | |
| "reward_std": 0.6189329201976458, | |
| "rewards/equation_reward_func": 0.46527778916060925, | |
| "rewards/format_reward_func": 0.2638888967533906, | |
| "step": 442 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 334.00001017252606, | |
| "epoch": 0.840378548895899, | |
| "grad_norm": 61.315133397385836, | |
| "kl": 64.54166666666667, | |
| "learning_rate": 4.974658252654134e-07, | |
| "loss": 0.3642, | |
| "reward": 0.6111111268401146, | |
| "reward_std": 0.6266890317201614, | |
| "rewards/equation_reward_func": 0.3402777823309104, | |
| "rewards/format_reward_func": 0.27083334140479565, | |
| "step": 444 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 287.41667556762695, | |
| "epoch": 0.8441640378548896, | |
| "grad_norm": 111.02963258009683, | |
| "kl": 73.3125, | |
| "learning_rate": 4.974273494757822e-07, | |
| "loss": 0.2892, | |
| "reward": 0.736111139257749, | |
| "reward_std": 0.5954531555374464, | |
| "rewards/equation_reward_func": 0.430555568387111, | |
| "rewards/format_reward_func": 0.3055555609365304, | |
| "step": 446 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 315.1458447774251, | |
| "epoch": 0.8479495268138801, | |
| "grad_norm": 416.3212915048579, | |
| "kl": 112.47135416666667, | |
| "learning_rate": 4.973885853086141e-07, | |
| "loss": 0.3557, | |
| "reward": 0.7083333532015482, | |
| "reward_std": 0.595863493780295, | |
| "rewards/equation_reward_func": 0.43750001055498916, | |
| "rewards/format_reward_func": 0.27083333892126876, | |
| "step": 448 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 282.88195419311523, | |
| "epoch": 0.8517350157728707, | |
| "grad_norm": 84.57887686537714, | |
| "kl": 97.375, | |
| "learning_rate": 4.973495328090889e-07, | |
| "loss": 0.4201, | |
| "reward": 0.5625000124176344, | |
| "reward_std": 0.6184229714175066, | |
| "rewards/equation_reward_func": 0.24305556466182074, | |
| "rewards/format_reward_func": 0.31944445334374905, | |
| "step": 450 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 340.87501271565753, | |
| "epoch": 0.8555205047318613, | |
| "grad_norm": 139.9178717709, | |
| "kl": 92.39583333333333, | |
| "learning_rate": 4.973101920227225e-07, | |
| "loss": 0.3206, | |
| "reward": 0.5555555683871111, | |
| "reward_std": 0.6198337351282438, | |
| "rewards/equation_reward_func": 0.3263888955116272, | |
| "rewards/format_reward_func": 0.22916667287548384, | |
| "step": 452 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 351.2916768391927, | |
| "epoch": 0.8593059936908517, | |
| "grad_norm": 168.90078404576994, | |
| "kl": 58.34375, | |
| "learning_rate": 4.972705629953667e-07, | |
| "loss": 0.3032, | |
| "reward": 0.7083333482344946, | |
| "reward_std": 0.6670572757720947, | |
| "rewards/equation_reward_func": 0.395833349476258, | |
| "rewards/format_reward_func": 0.3125000074505806, | |
| "step": 454 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 346.7777913411458, | |
| "epoch": 0.8630914826498423, | |
| "grad_norm": 53.736984247481196, | |
| "kl": 71.42708333333333, | |
| "learning_rate": 4.97230645773209e-07, | |
| "loss": 0.3515, | |
| "reward": 0.6180555665244659, | |
| "reward_std": 0.5822310447692871, | |
| "rewards/equation_reward_func": 0.3680555621782939, | |
| "rewards/format_reward_func": 0.25000000682969886, | |
| "step": 456 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 329.21528879801434, | |
| "epoch": 0.8668769716088328, | |
| "grad_norm": 61.81724196878047, | |
| "kl": 71.97395833333333, | |
| "learning_rate": 4.971904404027736e-07, | |
| "loss": 0.3712, | |
| "reward": 0.5972222362955412, | |
| "reward_std": 0.6221836258967718, | |
| "rewards/equation_reward_func": 0.34722223319113255, | |
| "rewards/format_reward_func": 0.2500000062088172, | |
| "step": 458 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 400.7083511352539, | |
| "epoch": 0.8706624605678234, | |
| "grad_norm": 89.56376909680318, | |
| "kl": 93.92708333333333, | |
| "learning_rate": 4.971499469309197e-07, | |
| "loss": 0.3209, | |
| "reward": 0.5486111330489317, | |
| "reward_std": 0.5003731027245522, | |
| "rewards/equation_reward_func": 0.3611111169060071, | |
| "rewards/format_reward_func": 0.18750000558793545, | |
| "step": 460 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 379.7430674235026, | |
| "epoch": 0.8744479495268139, | |
| "grad_norm": 96.80934872874563, | |
| "kl": 71.47916666666667, | |
| "learning_rate": 4.971091654048427e-07, | |
| "loss": 0.2863, | |
| "reward": 0.4166666828095913, | |
| "reward_std": 0.5312095309297243, | |
| "rewards/equation_reward_func": 0.26388889489074546, | |
| "rewards/format_reward_func": 0.15277778233091036, | |
| "step": 462 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 379.1250063578288, | |
| "epoch": 0.8782334384858044, | |
| "grad_norm": 313.3277909671654, | |
| "kl": 157.80208333333334, | |
| "learning_rate": 4.970680958720733e-07, | |
| "loss": 0.5211, | |
| "reward": 0.48611112746099633, | |
| "reward_std": 0.5439305094381174, | |
| "rewards/equation_reward_func": 0.2222222313284874, | |
| "rewards/format_reward_func": 0.2638888992369175, | |
| "step": 464 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 365.0416742960612, | |
| "epoch": 0.8820189274447949, | |
| "grad_norm": 121.11532507159346, | |
| "kl": 131.42708333333334, | |
| "learning_rate": 4.970267383804787e-07, | |
| "loss": 0.4011, | |
| "reward": 0.4375000149011612, | |
| "reward_std": 0.5364614203572273, | |
| "rewards/equation_reward_func": 0.28472223070760566, | |
| "rewards/format_reward_func": 0.1527777792265018, | |
| "step": 466 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 354.7916742960612, | |
| "epoch": 0.8858044164037855, | |
| "grad_norm": 90.17803998443027, | |
| "kl": 152.9375, | |
| "learning_rate": 4.96985092978261e-07, | |
| "loss": 0.4152, | |
| "reward": 0.4305555696288745, | |
| "reward_std": 0.5253821363051733, | |
| "rewards/equation_reward_func": 0.2847222325702508, | |
| "rewards/format_reward_func": 0.14583333457509676, | |
| "step": 468 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 281.70834223429364, | |
| "epoch": 0.889589905362776, | |
| "grad_norm": 90.52105948028516, | |
| "kl": 115.625, | |
| "learning_rate": 4.969431597139581e-07, | |
| "loss": 0.2493, | |
| "reward": 0.5000000074505806, | |
| "reward_std": 0.6266848891973495, | |
| "rewards/equation_reward_func": 0.3472222338120143, | |
| "rewards/format_reward_func": 0.15277778171002865, | |
| "step": 470 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 323.0347315470378, | |
| "epoch": 0.8933753943217666, | |
| "grad_norm": 113.45003802315175, | |
| "kl": 83.2734375, | |
| "learning_rate": 4.969009386364433e-07, | |
| "loss": 0.3054, | |
| "reward": 0.4861111131807168, | |
| "reward_std": 0.581800473233064, | |
| "rewards/equation_reward_func": 0.2986111218730609, | |
| "rewards/format_reward_func": 0.1875000068296989, | |
| "step": 472 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 427.35418065388996, | |
| "epoch": 0.897160883280757, | |
| "grad_norm": 28.936753072783624, | |
| "kl": 65.74479166666667, | |
| "learning_rate": 4.968584297949254e-07, | |
| "loss": 0.2886, | |
| "reward": 0.4305555659035842, | |
| "reward_std": 0.5503566016753515, | |
| "rewards/equation_reward_func": 0.3194444527228673, | |
| "rewards/format_reward_func": 0.11111111442248027, | |
| "step": 474 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 356.95834096272785, | |
| "epoch": 0.9009463722397476, | |
| "grad_norm": 53.210272136279166, | |
| "kl": 67.9296875, | |
| "learning_rate": 4.968156332389489e-07, | |
| "loss": 0.2718, | |
| "reward": 0.652777798473835, | |
| "reward_std": 0.6074397390087446, | |
| "rewards/equation_reward_func": 0.44444445582727593, | |
| "rewards/format_reward_func": 0.20833333830038706, | |
| "step": 476 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 345.06250254313153, | |
| "epoch": 0.9047318611987382, | |
| "grad_norm": 68.31437143998066, | |
| "kl": 26.390625, | |
| "learning_rate": 4.967725490183929e-07, | |
| "loss": 0.2034, | |
| "reward": 0.5625000111758709, | |
| "reward_std": 0.6406622032324473, | |
| "rewards/equation_reward_func": 0.35416667970518273, | |
| "rewards/format_reward_func": 0.2083333389212688, | |
| "step": 478 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 384.29168192545575, | |
| "epoch": 0.9085173501577287, | |
| "grad_norm": 46.566871330301204, | |
| "kl": 38.3125, | |
| "learning_rate": 4.967291771834726e-07, | |
| "loss": 0.2743, | |
| "reward": 0.5138889116545519, | |
| "reward_std": 0.6012993454933167, | |
| "rewards/equation_reward_func": 0.3472222375373046, | |
| "rewards/format_reward_func": 0.1666666685293118, | |
| "step": 480 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 403.1458435058594, | |
| "epoch": 0.9123028391167193, | |
| "grad_norm": 52.966354403482825, | |
| "kl": 58.018229166666664, | |
| "learning_rate": 4.96685517784738e-07, | |
| "loss": 0.1692, | |
| "reward": 0.5555555783212185, | |
| "reward_std": 0.5279722325503826, | |
| "rewards/equation_reward_func": 0.3819444576899211, | |
| "rewards/format_reward_func": 0.17361111318071684, | |
| "step": 482 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 415.2847328186035, | |
| "epoch": 0.9160883280757098, | |
| "grad_norm": 76.22102872372605, | |
| "kl": 36.552083333333336, | |
| "learning_rate": 4.966415708730742e-07, | |
| "loss": 0.2723, | |
| "reward": 0.4930555745959282, | |
| "reward_std": 0.5246221944689751, | |
| "rewards/equation_reward_func": 0.31944445210198563, | |
| "rewards/format_reward_func": 0.173611115043362, | |
| "step": 484 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 414.2291793823242, | |
| "epoch": 0.9198738170347003, | |
| "grad_norm": 126.65545998025205, | |
| "kl": 60.572916666666664, | |
| "learning_rate": 4.965973364997015e-07, | |
| "loss": 0.2943, | |
| "reward": 0.5138889017204443, | |
| "reward_std": 0.6207031682133675, | |
| "rewards/equation_reward_func": 0.3402777835726738, | |
| "rewards/format_reward_func": 0.1736111156642437, | |
| "step": 486 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 448.9166768391927, | |
| "epoch": 0.9236593059936908, | |
| "grad_norm": 39.174744576485224, | |
| "kl": 56.713541666666664, | |
| "learning_rate": 4.965528147161752e-07, | |
| "loss": 0.2663, | |
| "reward": 0.46527779412766296, | |
| "reward_std": 0.4942639557023843, | |
| "rewards/equation_reward_func": 0.30555556776622933, | |
| "rewards/format_reward_func": 0.1597222244987885, | |
| "step": 488 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 486.6319580078125, | |
| "epoch": 0.9274447949526814, | |
| "grad_norm": 48.44066729068605, | |
| "kl": 102.69791666666667, | |
| "learning_rate": 4.965080055743858e-07, | |
| "loss": 0.2164, | |
| "reward": 0.36805556279917556, | |
| "reward_std": 0.503364427636067, | |
| "rewards/equation_reward_func": 0.22916667411724725, | |
| "rewards/format_reward_func": 0.1388888917863369, | |
| "step": 490 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 491.8541844685872, | |
| "epoch": 0.931230283911672, | |
| "grad_norm": 46.336260606492786, | |
| "kl": 81.82291666666667, | |
| "learning_rate": 4.964629091265583e-07, | |
| "loss": 0.2553, | |
| "reward": 0.36805556900799274, | |
| "reward_std": 0.39493420471747714, | |
| "rewards/equation_reward_func": 0.2430555603156487, | |
| "rewards/format_reward_func": 0.1250000031044086, | |
| "step": 492 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 470.3472315470378, | |
| "epoch": 0.9350157728706625, | |
| "grad_norm": 117.64457418851589, | |
| "kl": 107.79166666666667, | |
| "learning_rate": 4.964175254252529e-07, | |
| "loss": 0.2875, | |
| "reward": 0.29166667473812896, | |
| "reward_std": 0.40408586089809734, | |
| "rewards/equation_reward_func": 0.1944444508602222, | |
| "rewards/format_reward_func": 0.09722222449878852, | |
| "step": 494 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 440.8472391764323, | |
| "epoch": 0.938801261829653, | |
| "grad_norm": 122.35688787505664, | |
| "kl": 80.23958333333333, | |
| "learning_rate": 4.963718545233644e-07, | |
| "loss": 0.2675, | |
| "reward": 0.2916666815678279, | |
| "reward_std": 0.4292173832654953, | |
| "rewards/equation_reward_func": 0.1527777804682652, | |
| "rewards/format_reward_func": 0.1388888917863369, | |
| "step": 496 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 451.4514045715332, | |
| "epoch": 0.9425867507886435, | |
| "grad_norm": 61.21324750008328, | |
| "kl": 64.75, | |
| "learning_rate": 4.963258964741226e-07, | |
| "loss": 0.3291, | |
| "reward": 0.3819444589316845, | |
| "reward_std": 0.4863445957501729, | |
| "rewards/equation_reward_func": 0.26388889489074546, | |
| "rewards/format_reward_func": 0.11805555845300357, | |
| "step": 498 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 448.3680674235026, | |
| "epoch": 0.9463722397476341, | |
| "grad_norm": 76.04318455804953, | |
| "kl": 56.770833333333336, | |
| "learning_rate": 4.962796513310916e-07, | |
| "loss": 0.2302, | |
| "reward": 0.3333333383003871, | |
| "reward_std": 0.4893345981836319, | |
| "rewards/equation_reward_func": 0.2222222276031971, | |
| "rewards/format_reward_func": 0.11111111318071683, | |
| "step": 500 | |
| } | |
| ], | |
| "logging_steps": 2, | |
| "max_steps": 6000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 12, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |