{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9463722397476341, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 405.5555674235026, "epoch": 0.0037854889589905363, "grad_norm": 1.5204231066145135, "kl": 0.0, "learning_rate": 5.555555555555555e-09, "loss": 0.0329, "reward": 0.3750000099341075, "reward_std": 0.3891436904668808, "rewards/equation_reward_func": 0.3472222325702508, "rewards/format_reward_func": 0.027777778605620067, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 426.1597366333008, "epoch": 0.007570977917981073, "grad_norm": 1.6075594847685568, "kl": 0.00020535786946614584, "learning_rate": 1.111111111111111e-08, "loss": 0.0004, "reward": 0.35416667846341926, "reward_std": 0.40144437551498413, "rewards/equation_reward_func": 0.3333333432674408, "rewards/format_reward_func": 0.02083333395421505, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 389.59028879801434, "epoch": 0.011356466876971609, "grad_norm": 1.7738388926882676, "kl": 0.00020662943522135416, "learning_rate": 1.6666666666666667e-08, "loss": 0.0068, "reward": 0.3611111206312974, "reward_std": 0.34669753164052963, "rewards/equation_reward_func": 0.3263888942698638, "rewards/format_reward_func": 0.034722223257025085, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 407.18751271565753, "epoch": 0.015141955835962145, "grad_norm": 1.7413085729902613, "kl": 0.00020869572957356772, "learning_rate": 2.222222222222222e-08, "loss": -0.0175, "reward": 0.3750000149011612, "reward_std": 0.43933459122975665, "rewards/equation_reward_func": 0.36111112497746944, "rewards/format_reward_func": 0.013888889302810034, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 436.61806615193683, "epoch": 0.01892744479495268, "grad_norm": 1.5523678031549322, "kl": 0.0001990795135498047, "learning_rate": 2.7777777777777774e-08, "loss": 0.0009, "reward": 0.3958333482344945, "reward_std": 0.4240533635020256, "rewards/equation_reward_func": 0.3888889029622078, "rewards/format_reward_func": 0.006944444651405017, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 417.5763982137044, "epoch": 0.022712933753943218, "grad_norm": 2.1871520648907357, "kl": 0.0002460479736328125, "learning_rate": 3.3333333333333334e-08, "loss": 0.0668, "reward": 0.31944445210198563, "reward_std": 0.3596703422566255, "rewards/equation_reward_func": 0.31250000807146233, "rewards/format_reward_func": 0.006944444651405017, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 403.26390075683594, "epoch": 0.026498422712933754, "grad_norm": 1.6906264767615913, "kl": 0.00021004676818847656, "learning_rate": 3.888888888888889e-08, "loss": 0.0052, "reward": 0.3611111268401146, "reward_std": 0.42362942298253375, "rewards/equation_reward_func": 0.354166679084301, "rewards/format_reward_func": 0.006944444651405017, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 393.6944580078125, "epoch": 0.03028391167192429, "grad_norm": 1.8134093955469572, "kl": 0.0002319812774658203, "learning_rate": 4.444444444444444e-08, "loss": 0.0291, "reward": 0.4097222362955411, "reward_std": 0.43579815079768497, "rewards/equation_reward_func": 0.3888889004786809, "rewards/format_reward_func": 0.02083333395421505, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 440.2708460489909, "epoch": 0.03406940063091483, "grad_norm": 1.4606786067632986, "kl": 0.0002152125040690104, "learning_rate": 5e-08, "loss": 0.0191, "reward": 0.3888889004786809, "reward_std": 0.44846897075573605, "rewards/equation_reward_func": 0.3680555671453476, "rewards/format_reward_func": 0.02083333395421505, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 411.56251525878906, "epoch": 0.03785488958990536, "grad_norm": 8.74750891658874, "kl": 0.00022975603739420572, "learning_rate": 5.555555555555555e-08, "loss": 0.0165, "reward": 0.38888889861603576, "reward_std": 0.3779858859876792, "rewards/equation_reward_func": 0.38888889861603576, "rewards/format_reward_func": 0.0, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 410.5694580078125, "epoch": 0.0416403785488959, "grad_norm": 2.0666026097367185, "kl": 0.0002140204111735026, "learning_rate": 6.111111111111111e-08, "loss": 0.0489, "reward": 0.4305555671453476, "reward_std": 0.4184086322784424, "rewards/equation_reward_func": 0.40277779164413613, "rewards/format_reward_func": 0.027777778605620067, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 398.5972366333008, "epoch": 0.045425867507886436, "grad_norm": 1.4353693037214081, "kl": 0.00022824605305989584, "learning_rate": 6.666666666666667e-08, "loss": 0.0561, "reward": 0.39583334140479565, "reward_std": 0.38249212006727856, "rewards/equation_reward_func": 0.37500000807146233, "rewards/format_reward_func": 0.02083333395421505, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 406.06251525878906, "epoch": 0.04921135646687697, "grad_norm": 2.0297758030760487, "kl": 0.00023778279622395834, "learning_rate": 7.222222222222221e-08, "loss": -0.036, "reward": 0.2847222263614337, "reward_std": 0.35836515327294666, "rewards/equation_reward_func": 0.2638888942698638, "rewards/format_reward_func": 0.02083333395421505, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 404.18751271565753, "epoch": 0.05299684542586751, "grad_norm": 1.7807227026542323, "kl": 0.0002464453379313151, "learning_rate": 7.777777777777778e-08, "loss": -0.0037, "reward": 0.3819444552063942, "reward_std": 0.3984878833095233, "rewards/equation_reward_func": 0.37500001055498916, "rewards/format_reward_func": 0.006944444651405017, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 443.22918192545575, "epoch": 0.056782334384858045, "grad_norm": 1.4639198723709446, "kl": 0.0002082983652750651, "learning_rate": 8.333333333333333e-08, "loss": 0.0215, "reward": 0.28472222946584225, "reward_std": 0.35284433389703435, "rewards/equation_reward_func": 0.26388889613250893, "rewards/format_reward_func": 0.02083333395421505, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 417.1458460489909, "epoch": 0.06056782334384858, "grad_norm": 1.551783394227111, "kl": 0.0002196629842122396, "learning_rate": 8.888888888888888e-08, "loss": -0.0381, "reward": 0.4236111231148243, "reward_std": 0.4627470038831234, "rewards/equation_reward_func": 0.409722230086724, "rewards/format_reward_func": 0.013888889302810034, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 425.0972366333008, "epoch": 0.06435331230283911, "grad_norm": 1.6519839518228945, "kl": 0.0002177556355794271, "learning_rate": 9.444444444444444e-08, "loss": 0.0149, "reward": 0.28472222946584225, "reward_std": 0.36097555483380955, "rewards/equation_reward_func": 0.26388889489074546, "rewards/format_reward_func": 0.02083333395421505, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 416.1597315470378, "epoch": 0.06813880126182965, "grad_norm": 1.6298419922409495, "kl": 0.00024358431498209635, "learning_rate": 1e-07, "loss": 0.0544, "reward": 0.31250000931322575, "reward_std": 0.406619085619847, "rewards/equation_reward_func": 0.27777778667708236, "rewards/format_reward_func": 0.034722223257025085, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 406.13195546468097, "epoch": 0.07192429022082018, "grad_norm": 1.7781933588930947, "kl": 0.00020241737365722656, "learning_rate": 1.0555555555555555e-07, "loss": 0.0181, "reward": 0.5208333432674408, "reward_std": 0.48631447553634644, "rewards/equation_reward_func": 0.5000000074505806, "rewards/format_reward_func": 0.02083333395421505, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 442.87500890096027, "epoch": 0.07570977917981073, "grad_norm": 1.7856778327927993, "kl": 0.00023746490478515625, "learning_rate": 1.111111111111111e-07, "loss": -0.0027, "reward": 0.32638889613250893, "reward_std": 0.37259839847683907, "rewards/equation_reward_func": 0.31250000807146233, "rewards/format_reward_func": 0.013888889302810034, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 375.65973409016925, "epoch": 0.07949526813880126, "grad_norm": 1.9930393569793248, "kl": 0.00021648406982421875, "learning_rate": 1.1666666666666667e-07, "loss": 0.0641, "reward": 0.4236111268401146, "reward_std": 0.38463745390375453, "rewards/equation_reward_func": 0.4027777910232544, "rewards/format_reward_func": 0.02083333395421505, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 411.6736195882161, "epoch": 0.0832807570977918, "grad_norm": 1.4888095636144503, "kl": 0.0002304712931315104, "learning_rate": 1.2222222222222222e-07, "loss": 0.0313, "reward": 0.31944445210198563, "reward_std": 0.3178868380685647, "rewards/equation_reward_func": 0.28472223194936913, "rewards/format_reward_func": 0.034722223257025085, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 389.4166793823242, "epoch": 0.08706624605678233, "grad_norm": 1.6283738307368585, "kl": 0.00023396809895833334, "learning_rate": 1.2777777777777777e-07, "loss": 0.0686, "reward": 0.2986111169060071, "reward_std": 0.37988172471523285, "rewards/equation_reward_func": 0.2847222288449605, "rewards/format_reward_func": 0.013888889302810034, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 405.71528879801434, "epoch": 0.09085173501577287, "grad_norm": 12.938622660748152, "kl": 0.00023698806762695312, "learning_rate": 1.3333333333333334e-07, "loss": -0.0074, "reward": 0.2361111187686523, "reward_std": 0.3309923857450485, "rewards/equation_reward_func": 0.22222222574055195, "rewards/format_reward_func": 0.013888889302810034, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 434.0694580078125, "epoch": 0.0946372239747634, "grad_norm": 3.061348126208135, "kl": 0.00024008750915527344, "learning_rate": 1.3888888888888888e-07, "loss": -0.012, "reward": 0.27083334264655906, "reward_std": 0.34488533437252045, "rewards/equation_reward_func": 0.25694445086022216, "rewards/format_reward_func": 0.013888889302810034, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 413.0694529215495, "epoch": 0.09842271293375394, "grad_norm": 3.454024027390986, "kl": 0.0003235340118408203, "learning_rate": 1.4444444444444442e-07, "loss": -0.0064, "reward": 0.40972223194936913, "reward_std": 0.3772713306049506, "rewards/equation_reward_func": 0.40277778916060925, "rewards/format_reward_func": 0.006944444651405017, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 446.2986195882161, "epoch": 0.10220820189274447, "grad_norm": 1.6865767812775654, "kl": 0.00020933151245117188, "learning_rate": 1.5e-07, "loss": 0.0067, "reward": 0.3750000136593978, "reward_std": 0.36897342403729755, "rewards/equation_reward_func": 0.3611111231148243, "rewards/format_reward_func": 0.013888889302810034, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 441.1736246744792, "epoch": 0.10599369085173502, "grad_norm": 2.2986869828700334, "kl": 0.0004001458485921224, "learning_rate": 1.5555555555555556e-07, "loss": 0.0206, "reward": 0.3541666716337204, "reward_std": 0.3243444561958313, "rewards/equation_reward_func": 0.3472222263614337, "rewards/format_reward_func": 0.006944444651405017, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 432.27085240681964, "epoch": 0.10977917981072555, "grad_norm": 2.1906732682758645, "kl": 0.0002334117889404297, "learning_rate": 1.611111111111111e-07, "loss": 0.0075, "reward": 0.40972223194936913, "reward_std": 0.4255252617100875, "rewards/equation_reward_func": 0.39583334513008595, "rewards/format_reward_func": 0.013888889302810034, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 426.7569529215495, "epoch": 0.11356466876971609, "grad_norm": 2.041653144281195, "kl": 0.0002582073211669922, "learning_rate": 1.6666666666666665e-07, "loss": 0.0211, "reward": 0.3680555634200573, "reward_std": 0.40922948469718295, "rewards/equation_reward_func": 0.361111119389534, "rewards/format_reward_func": 0.006944444651405017, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 400.68751271565753, "epoch": 0.11735015772870662, "grad_norm": 1.93215349954409, "kl": 0.0002829233805338542, "learning_rate": 1.7222222222222222e-07, "loss": -0.033, "reward": 0.4097222400208314, "reward_std": 0.45954596251249313, "rewards/equation_reward_func": 0.3888889004786809, "rewards/format_reward_func": 0.02083333395421505, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 429.3125178019206, "epoch": 0.12113564668769716, "grad_norm": 1.8873459369947734, "kl": 0.0002741813659667969, "learning_rate": 1.7777777777777776e-07, "loss": 0.0397, "reward": 0.4305555708706379, "reward_std": 0.41432634244362515, "rewards/equation_reward_func": 0.4027777848144372, "rewards/format_reward_func": 0.027777778605620067, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 421.88890329996747, "epoch": 0.12492113564668769, "grad_norm": 1.688287521815126, "kl": 0.00026599566141764325, "learning_rate": 1.833333333333333e-07, "loss": 0.0008, "reward": 0.3472222276031971, "reward_std": 0.3376887192328771, "rewards/equation_reward_func": 0.3402777823309104, "rewards/format_reward_func": 0.006944444651405017, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 452.2708447774251, "epoch": 0.12870662460567822, "grad_norm": 1.37663800323155, "kl": 0.0003294944763183594, "learning_rate": 1.8888888888888888e-07, "loss": 0.0556, "reward": 0.3263888992369175, "reward_std": 0.2874133574465911, "rewards/equation_reward_func": 0.312500008692344, "rewards/format_reward_func": 0.013888889302810034, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 436.71528879801434, "epoch": 0.13249211356466878, "grad_norm": 1.4483289492436444, "kl": 0.00029428799947102863, "learning_rate": 1.9444444444444445e-07, "loss": 0.0386, "reward": 0.29861111876865226, "reward_std": 0.3121309739847978, "rewards/equation_reward_func": 0.28472222822407883, "rewards/format_reward_func": 0.013888889302810034, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 414.62501271565753, "epoch": 0.1362776025236593, "grad_norm": 1.4269628803342047, "kl": 0.0002837181091308594, "learning_rate": 2e-07, "loss": 0.0402, "reward": 0.32638889861603576, "reward_std": 0.35836514706412953, "rewards/equation_reward_func": 0.29861111752688885, "rewards/format_reward_func": 0.027777778605620067, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 398.69445546468097, "epoch": 0.14006309148264984, "grad_norm": 1.3415769326825684, "kl": 0.00044043858846028644, "learning_rate": 2.0555555555555553e-07, "loss": -0.0574, "reward": 0.3333333420256774, "reward_std": 0.33815376708904904, "rewards/equation_reward_func": 0.31250000682969886, "rewards/format_reward_func": 0.02083333395421505, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 397.2708460489909, "epoch": 0.14384858044164037, "grad_norm": 1.6736466199506606, "kl": 0.0003532568613688151, "learning_rate": 2.111111111111111e-07, "loss": 0.0355, "reward": 0.3958333519597848, "reward_std": 0.3349916177491347, "rewards/equation_reward_func": 0.3750000173846881, "rewards/format_reward_func": 0.02083333395421505, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 389.6388982137044, "epoch": 0.14763406940063092, "grad_norm": 1.6778745969677393, "kl": 0.0004076957702636719, "learning_rate": 2.1666666666666667e-07, "loss": -0.0089, "reward": 0.3819444514811039, "reward_std": 0.3766806833446026, "rewards/equation_reward_func": 0.3680555646618207, "rewards/format_reward_func": 0.013888889302810034, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 369.4305648803711, "epoch": 0.15141955835962145, "grad_norm": 1.7763243336052263, "kl": 0.0004963874816894531, "learning_rate": 2.222222222222222e-07, "loss": 0.0596, "reward": 0.3541666716337204, "reward_std": 0.4322179580728213, "rewards/equation_reward_func": 0.3125000049670537, "rewards/format_reward_func": 0.0416666679084301, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 418.77778879801434, "epoch": 0.15520504731861198, "grad_norm": 2.4954676920223084, "kl": 0.0005669593811035156, "learning_rate": 2.2777777777777776e-07, "loss": 0.0353, "reward": 0.4583333469927311, "reward_std": 0.4091739282011986, "rewards/equation_reward_func": 0.4305555659035842, "rewards/format_reward_func": 0.027777778605620067, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 389.9583460489909, "epoch": 0.1589905362776025, "grad_norm": 1.9030809806319569, "kl": 0.0004928906758626302, "learning_rate": 2.3333333333333333e-07, "loss": 0.047, "reward": 0.4236111255983512, "reward_std": 0.4178568907082081, "rewards/equation_reward_func": 0.4097222325702508, "rewards/format_reward_func": 0.013888889302810034, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 442.4930648803711, "epoch": 0.16277602523659307, "grad_norm": 1.3770817743623749, "kl": 0.0006133715311686198, "learning_rate": 2.388888888888889e-07, "loss": 0.0004, "reward": 0.4166666753590107, "reward_std": 0.37612894798318547, "rewards/equation_reward_func": 0.3958333469927311, "rewards/format_reward_func": 0.02083333395421505, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 443.9444529215495, "epoch": 0.1665615141955836, "grad_norm": 1.4650231298226628, "kl": 0.0006745656331380209, "learning_rate": 2.4444444444444445e-07, "loss": -0.0017, "reward": 0.3750000111758709, "reward_std": 0.3954201638698578, "rewards/equation_reward_func": 0.36805556652446586, "rewards/format_reward_func": 0.006944444651405017, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 429.5069630940755, "epoch": 0.17034700315457413, "grad_norm": 1.9750543844698667, "kl": 0.000976403554280599, "learning_rate": 2.5e-07, "loss": 0.0418, "reward": 0.36805556528270245, "reward_std": 0.4254308380186558, "rewards/equation_reward_func": 0.34027778543531895, "rewards/format_reward_func": 0.027777778605620067, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 414.8263982137044, "epoch": 0.17413249211356466, "grad_norm": 3.4819218815816417, "kl": 0.000812689463297526, "learning_rate": 2.5555555555555553e-07, "loss": -0.073, "reward": 0.4166666766007741, "reward_std": 0.3864077205459277, "rewards/equation_reward_func": 0.4097222313284874, "rewards/format_reward_func": 0.006944444651405017, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 358.4236195882161, "epoch": 0.17791798107255521, "grad_norm": 3.17839003794858, "kl": 0.0010786056518554688, "learning_rate": 2.6111111111111113e-07, "loss": 0.0271, "reward": 0.4305555634200573, "reward_std": 0.4322568451364835, "rewards/equation_reward_func": 0.4027777872979641, "rewards/format_reward_func": 0.027777778605620067, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 421.7361208597819, "epoch": 0.18170347003154574, "grad_norm": 1.581354739245434, "kl": 0.0016581217447916667, "learning_rate": 2.6666666666666667e-07, "loss": -0.0128, "reward": 0.40972223194936913, "reward_std": 0.4322179468969504, "rewards/equation_reward_func": 0.40277778667708236, "rewards/format_reward_func": 0.006944444651405017, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 378.01390075683594, "epoch": 0.18548895899053627, "grad_norm": 1.6207630586247244, "kl": 0.0008861223856608073, "learning_rate": 2.7222222222222216e-07, "loss": -0.0275, "reward": 0.48611112001041573, "reward_std": 0.38690390810370445, "rewards/equation_reward_func": 0.4513888992369175, "rewards/format_reward_func": 0.034722223257025085, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 410.0486208597819, "epoch": 0.1892744479495268, "grad_norm": 1.6017744101080356, "kl": 0.0018717447916666667, "learning_rate": 2.7777777777777776e-07, "loss": 0.0666, "reward": 0.4166666778425376, "reward_std": 0.4304381770392259, "rewards/equation_reward_func": 0.38888889489074546, "rewards/format_reward_func": 0.027777778605620067, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 417.06945546468097, "epoch": 0.19305993690851736, "grad_norm": 1.781222955044469, "kl": 0.0016377766927083333, "learning_rate": 2.833333333333333e-07, "loss": 0.01, "reward": 0.3750000037252903, "reward_std": 0.4017697374025981, "rewards/equation_reward_func": 0.3472222263614337, "rewards/format_reward_func": 0.027777778605620067, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 394.3333460489909, "epoch": 0.1968454258675079, "grad_norm": 2.071755215843617, "kl": 0.0020945866902669272, "learning_rate": 2.8888888888888885e-07, "loss": 0.0326, "reward": 0.46527778543531895, "reward_std": 0.4230251908302307, "rewards/equation_reward_func": 0.42361111876865226, "rewards/format_reward_func": 0.0416666679084301, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 381.5902913411458, "epoch": 0.20063091482649842, "grad_norm": 4.141898501255106, "kl": 0.002117792765299479, "learning_rate": 2.9444444444444444e-07, "loss": 0.0573, "reward": 0.3819444632778565, "reward_std": 0.35283846283952397, "rewards/equation_reward_func": 0.36111112249394256, "rewards/format_reward_func": 0.02083333395421505, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 427.81251271565753, "epoch": 0.20441640378548895, "grad_norm": 1.8973650475253758, "kl": 0.0040442148844401045, "learning_rate": 3e-07, "loss": 0.0408, "reward": 0.40277778543531895, "reward_std": 0.3682141068081061, "rewards/equation_reward_func": 0.38194445582727593, "rewards/format_reward_func": 0.02083333395421505, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 419.87501017252606, "epoch": 0.2082018927444795, "grad_norm": 1.7756216737476342, "kl": 0.0018666585286458333, "learning_rate": 3.055555555555556e-07, "loss": -0.0149, "reward": 0.4722222350537777, "reward_std": 0.4178180123368899, "rewards/equation_reward_func": 0.45833334513008595, "rewards/format_reward_func": 0.013888889302810034, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 418.2222315470378, "epoch": 0.21198738170347003, "grad_norm": 1.5674546331115733, "kl": 0.0031108856201171875, "learning_rate": 3.111111111111111e-07, "loss": -0.0077, "reward": 0.36805556279917556, "reward_std": 0.36998799939950305, "rewards/equation_reward_func": 0.34027778543531895, "rewards/format_reward_func": 0.027777778605620067, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 365.8333460489909, "epoch": 0.21577287066246056, "grad_norm": 2.163488952988276, "kl": 0.003872553507486979, "learning_rate": 3.166666666666666e-07, "loss": 0.0248, "reward": 0.44444446079432964, "reward_std": 0.44138550013303757, "rewards/equation_reward_func": 0.41666667722165585, "rewards/format_reward_func": 0.027777778605620067, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 377.95140329996747, "epoch": 0.2195583596214511, "grad_norm": 1.9890628606938958, "kl": 0.0058383941650390625, "learning_rate": 3.222222222222222e-07, "loss": 0.0108, "reward": 0.44444446203609306, "reward_std": 0.4227793253958225, "rewards/equation_reward_func": 0.4097222338120143, "rewards/format_reward_func": 0.034722223257025085, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 390.0833485921224, "epoch": 0.22334384858044165, "grad_norm": 1.6950159200633848, "kl": 0.0033391316731770835, "learning_rate": 3.2777777777777776e-07, "loss": 0.0218, "reward": 0.5000000149011612, "reward_std": 0.46135225395361584, "rewards/equation_reward_func": 0.472222238779068, "rewards/format_reward_func": 0.027777778605620067, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 434.47223409016925, "epoch": 0.22712933753943218, "grad_norm": 1.6912972588519333, "kl": 0.0037129720052083335, "learning_rate": 3.333333333333333e-07, "loss": 0.0303, "reward": 0.402777789781491, "reward_std": 0.37431980296969414, "rewards/equation_reward_func": 0.3888888992369175, "rewards/format_reward_func": 0.013888889302810034, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 379.2916768391927, "epoch": 0.2309148264984227, "grad_norm": 1.7665244345202618, "kl": 0.010921478271484375, "learning_rate": 3.388888888888889e-07, "loss": 0.0464, "reward": 0.38194445086022216, "reward_std": 0.3815583561857541, "rewards/equation_reward_func": 0.3402777835726738, "rewards/format_reward_func": 0.041666667287548385, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 346.40973409016925, "epoch": 0.23470031545741324, "grad_norm": 1.7777246622024319, "kl": 0.0057525634765625, "learning_rate": 3.4444444444444444e-07, "loss": 0.073, "reward": 0.5555555739750465, "reward_std": 0.5072049958010515, "rewards/equation_reward_func": 0.5138889104127884, "rewards/format_reward_func": 0.0416666679084301, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 394.50001525878906, "epoch": 0.2384858044164038, "grad_norm": 1.2724495191880816, "kl": 0.005407969156901042, "learning_rate": 3.5e-07, "loss": 0.0085, "reward": 0.5416666809469461, "reward_std": 0.4325893906255563, "rewards/equation_reward_func": 0.5138889035830895, "rewards/format_reward_func": 0.027777778605620067, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 426.51390584309894, "epoch": 0.24227129337539433, "grad_norm": 1.7084837781655138, "kl": 0.013666788736979166, "learning_rate": 3.5555555555555553e-07, "loss": -0.0031, "reward": 0.40277779288589954, "reward_std": 0.4025290633241336, "rewards/equation_reward_func": 0.36805557149151963, "rewards/format_reward_func": 0.034722223257025085, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 393.2986195882161, "epoch": 0.24605678233438485, "grad_norm": 1.6652131128084247, "kl": 0.009862263997395834, "learning_rate": 3.6111111111111107e-07, "loss": 0.076, "reward": 0.5277777922650179, "reward_std": 0.4298570702473323, "rewards/equation_reward_func": 0.5000000111758709, "rewards/format_reward_func": 0.027777778605620067, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 387.29862213134766, "epoch": 0.24984227129337538, "grad_norm": 1.4235829543070357, "kl": 0.006196339925130208, "learning_rate": 3.666666666666666e-07, "loss": 0.0928, "reward": 0.5763888955116272, "reward_std": 0.4299643337726593, "rewards/equation_reward_func": 0.5277777959903082, "rewards/format_reward_func": 0.048611112559835114, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 412.95140329996747, "epoch": 0.25362776025236594, "grad_norm": 1.5899891064412919, "kl": 0.010592142740885416, "learning_rate": 3.722222222222222e-07, "loss": -0.0172, "reward": 0.46527778543531895, "reward_std": 0.4477427862584591, "rewards/equation_reward_func": 0.39583334264655906, "rewards/format_reward_func": 0.06944444589316845, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 375.7916768391927, "epoch": 0.25741324921135644, "grad_norm": 1.8581102947843942, "kl": 0.0097503662109375, "learning_rate": 3.7777777777777775e-07, "loss": 0.0656, "reward": 0.3819444539646308, "reward_std": 0.4159533294538657, "rewards/equation_reward_func": 0.3263888979951541, "rewards/format_reward_func": 0.055555557211240135, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 402.8333460489909, "epoch": 0.261198738170347, "grad_norm": 1.8615199132284905, "kl": 0.00942230224609375, "learning_rate": 3.8333333333333335e-07, "loss": 0.1235, "reward": 0.3958333407839139, "reward_std": 0.4607119709253311, "rewards/equation_reward_func": 0.33333334637184936, "rewards/format_reward_func": 0.06250000186264515, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 388.50695037841797, "epoch": 0.26498422712933756, "grad_norm": 1.4132445755845657, "kl": 0.031420389811197914, "learning_rate": 3.888888888888889e-07, "loss": -0.0025, "reward": 0.6388889079292616, "reward_std": 0.4517383811374505, "rewards/equation_reward_func": 0.5972222362955412, "rewards/format_reward_func": 0.0416666679084301, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 399.69445546468097, "epoch": 0.26876971608832806, "grad_norm": 1.6231049342643675, "kl": 0.011019388834635416, "learning_rate": 3.9444444444444444e-07, "loss": -0.0123, "reward": 0.5486111268401146, "reward_std": 0.4811764856179555, "rewards/equation_reward_func": 0.486111119389534, "rewards/format_reward_func": 0.06250000186264515, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 386.03473409016925, "epoch": 0.2725552050473186, "grad_norm": 1.5773905454247337, "kl": 0.04953765869140625, "learning_rate": 4e-07, "loss": 0.0301, "reward": 0.5486111262192329, "reward_std": 0.44820784653226536, "rewards/equation_reward_func": 0.48611111876865226, "rewards/format_reward_func": 0.06250000124176343, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 398.125005086263, "epoch": 0.2763406940063092, "grad_norm": 1.2456280183136625, "kl": 0.025739034016927082, "learning_rate": 4.055555555555555e-07, "loss": 0.0387, "reward": 0.5000000186264515, "reward_std": 0.321004219353199, "rewards/equation_reward_func": 0.4444444552063942, "rewards/format_reward_func": 0.055555557211240135, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 366.2986195882161, "epoch": 0.2801261829652997, "grad_norm": 1.5044443965642595, "kl": 0.021631876627604168, "learning_rate": 4.1111111111111107e-07, "loss": 0.0575, "reward": 0.43055557273328304, "reward_std": 0.34387076273560524, "rewards/equation_reward_func": 0.38888889613250893, "rewards/format_reward_func": 0.0416666679084301, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 372.86112721761066, "epoch": 0.28391167192429023, "grad_norm": 2.033550282447034, "kl": 0.02593231201171875, "learning_rate": 4.1666666666666667e-07, "loss": 0.1121, "reward": 0.6805555721124014, "reward_std": 0.5391590123375257, "rewards/equation_reward_func": 0.5486111287027597, "rewards/format_reward_func": 0.1319444477558136, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 340.0138969421387, "epoch": 0.28769716088328073, "grad_norm": 2.2987693590791363, "kl": 0.0315399169921875, "learning_rate": 4.222222222222222e-07, "loss": 0.0078, "reward": 0.5694444663822651, "reward_std": 0.4559611765046914, "rewards/equation_reward_func": 0.45833334761361283, "rewards/format_reward_func": 0.11111111318071683, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 378.94445546468097, "epoch": 0.2914826498422713, "grad_norm": 1.9562773273854706, "kl": 0.09186299641927083, "learning_rate": 4.2777777777777775e-07, "loss": 0.0201, "reward": 0.6041666741172472, "reward_std": 0.49037906900048256, "rewards/equation_reward_func": 0.500000017384688, "rewards/format_reward_func": 0.10416666915019353, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 360.2152913411458, "epoch": 0.29526813880126185, "grad_norm": 1.8560446617911341, "kl": 0.060872395833333336, "learning_rate": 4.3333333333333335e-07, "loss": 0.0208, "reward": 0.5208333457509676, "reward_std": 0.4362143650650978, "rewards/equation_reward_func": 0.4444444564481576, "rewards/format_reward_func": 0.07638888992369175, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 397.4930725097656, "epoch": 0.29905362776025235, "grad_norm": 1.7722682766121323, "kl": 0.03704833984375, "learning_rate": 4.3888888888888884e-07, "loss": 0.087, "reward": 0.6736111355324587, "reward_std": 0.4783005639910698, "rewards/equation_reward_func": 0.5277777922650179, "rewards/format_reward_func": 0.14583333705862364, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 340.4583485921224, "epoch": 0.3028391167192429, "grad_norm": 1.8020563834520036, "kl": 0.052164713541666664, "learning_rate": 4.444444444444444e-07, "loss": 0.0567, "reward": 0.4791666753590107, "reward_std": 0.42059509828686714, "rewards/equation_reward_func": 0.4027777860562007, "rewards/format_reward_func": 0.07638888992369175, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 331.2986234029134, "epoch": 0.30662460567823346, "grad_norm": 1.816923697504477, "kl": 0.14789835611979166, "learning_rate": 4.5e-07, "loss": 0.0408, "reward": 0.6319444521019856, "reward_std": 0.40209560344616574, "rewards/equation_reward_func": 0.5208333420256773, "rewards/format_reward_func": 0.11111111318071683, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 374.74306615193683, "epoch": 0.31041009463722397, "grad_norm": 1.6763316060977995, "kl": 0.04315185546875, "learning_rate": 4.555555555555555e-07, "loss": 0.0657, "reward": 0.652777798473835, "reward_std": 0.47464097539583844, "rewards/equation_reward_func": 0.4722222313284874, "rewards/format_reward_func": 0.180555559694767, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 377.5416768391927, "epoch": 0.3141955835962145, "grad_norm": 1.6769854770195711, "kl": 0.069580078125, "learning_rate": 4.611111111111111e-07, "loss": 0.0887, "reward": 0.6527777897814909, "reward_std": 0.5109836533665657, "rewards/equation_reward_func": 0.4722222400208314, "rewards/format_reward_func": 0.180555559694767, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 355.3680648803711, "epoch": 0.317981072555205, "grad_norm": 1.5011515214179916, "kl": 0.19896443684895834, "learning_rate": 4.6666666666666666e-07, "loss": 0.0834, "reward": 0.7638889228304228, "reward_std": 0.5609942426284155, "rewards/equation_reward_func": 0.569444460173448, "rewards/format_reward_func": 0.1944444483766953, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 353.7361195882161, "epoch": 0.3217665615141956, "grad_norm": 15.47233883807027, "kl": 0.056732177734375, "learning_rate": 4.722222222222222e-07, "loss": 0.0426, "reward": 0.7500000124176344, "reward_std": 0.5186516791582108, "rewards/equation_reward_func": 0.5000000161429247, "rewards/format_reward_func": 0.25000000931322575, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 308.31251017252606, "epoch": 0.32555205047318614, "grad_norm": 2.1360230493136325, "kl": 0.19114176432291666, "learning_rate": 4.777777777777778e-07, "loss": 0.1307, "reward": 0.8055555882553259, "reward_std": 0.5721215779582659, "rewards/equation_reward_func": 0.5000000086923441, "rewards/format_reward_func": 0.30555556404093903, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 314.75000762939453, "epoch": 0.32933753943217664, "grad_norm": 2.7019951929627415, "kl": 0.1749267578125, "learning_rate": 4.833333333333333e-07, "loss": 0.0575, "reward": 0.826388897995154, "reward_std": 0.5692646453777949, "rewards/equation_reward_func": 0.5138889017204443, "rewards/format_reward_func": 0.312500008692344, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 297.9722328186035, "epoch": 0.3331230283911672, "grad_norm": 2.3496202759776788, "kl": 0.3324991861979167, "learning_rate": 4.888888888888889e-07, "loss": 0.0734, "reward": 0.868055577079455, "reward_std": 0.6303805137674013, "rewards/equation_reward_func": 0.486111128081878, "rewards/format_reward_func": 0.3819444564481576, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 349.2708384195964, "epoch": 0.33690851735015775, "grad_norm": 7.352824169943818, "kl": 0.4977823893229167, "learning_rate": 4.944444444444445e-07, "loss": 0.0453, "reward": 0.770833362514774, "reward_std": 0.6322847319145998, "rewards/equation_reward_func": 0.3958333370586236, "rewards/format_reward_func": 0.37500001179675263, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 297.1666781107585, "epoch": 0.34069400630914826, "grad_norm": 24.940449042597443, "kl": 4.795857747395833, "learning_rate": 5e-07, "loss": 0.0703, "reward": 1.0555555820465088, "reward_std": 0.5782317991058031, "rewards/equation_reward_func": 0.5555555745959282, "rewards/format_reward_func": 0.5000000211099783, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 271.9027849833171, "epoch": 0.3444794952681388, "grad_norm": 2.5871456184851414, "kl": 14.199259440104166, "learning_rate": 4.999998543120144e-07, "loss": 0.0763, "reward": 1.0902778108914692, "reward_std": 0.5762393027544022, "rewards/equation_reward_func": 0.5694444589316845, "rewards/format_reward_func": 0.5208333482344946, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 309.31250890096027, "epoch": 0.3482649842271293, "grad_norm": 2.0327389699765313, "kl": 0.7781168619791666, "learning_rate": 4.999994172482276e-07, "loss": 0.1347, "reward": 0.895833358168602, "reward_std": 0.5533264875411987, "rewards/equation_reward_func": 0.43750001055498916, "rewards/format_reward_func": 0.4583333469927311, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 268.13889567057294, "epoch": 0.35205047318611987, "grad_norm": 15.941396290884985, "kl": 4.468831380208333, "learning_rate": 4.99998688809149e-07, "loss": 0.0794, "reward": 0.979166696468989, "reward_std": 0.5592605446775755, "rewards/equation_reward_func": 0.38194445582727593, "rewards/format_reward_func": 0.5972222338120142, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 247.0416742960612, "epoch": 0.35583596214511043, "grad_norm": 2.4432100547197657, "kl": 0.603515625, "learning_rate": 4.999976689956274e-07, "loss": 0.023, "reward": 1.1041666915019352, "reward_std": 0.5778869986534119, "rewards/equation_reward_func": 0.5138888973742723, "rewards/format_reward_func": 0.5902777935067812, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 248.41667302449545, "epoch": 0.35962145110410093, "grad_norm": 4.823568956607298, "kl": 1.6413167317708333, "learning_rate": 4.999963578088516e-07, "loss": 0.0856, "reward": 1.0694444874922435, "reward_std": 0.719012883802255, "rewards/equation_reward_func": 0.5069444564481577, "rewards/format_reward_func": 0.562500017384688, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 273.97917556762695, "epoch": 0.3634069400630915, "grad_norm": 2.743096965012267, "kl": 0.9237467447916666, "learning_rate": 4.999947552503497e-07, "loss": 0.1483, "reward": 1.1319444874922435, "reward_std": 0.6314157545566559, "rewards/equation_reward_func": 0.5208333457509676, "rewards/format_reward_func": 0.6111111268401146, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 246.41667556762695, "epoch": 0.36719242902208205, "grad_norm": 2.638981910331043, "kl": 0.8427327473958334, "learning_rate": 4.999928613219894e-07, "loss": 0.1078, "reward": 1.0625000471870105, "reward_std": 0.6069262598951658, "rewards/equation_reward_func": 0.4583333469927311, "rewards/format_reward_func": 0.6041666877766451, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 273.4652849833171, "epoch": 0.37097791798107255, "grad_norm": 3.1586256673049946, "kl": 0.6038411458333334, "learning_rate": 4.999906760259783e-07, "loss": 0.0848, "reward": 1.1944444874922435, "reward_std": 0.5770174351831278, "rewards/equation_reward_func": 0.548611123735706, "rewards/format_reward_func": 0.6458333482344946, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 279.87500762939453, "epoch": 0.3747634069400631, "grad_norm": 2.3306411923794284, "kl": 0.4184977213541667, "learning_rate": 4.999881993648632e-07, "loss": 0.1264, "reward": 1.1805555820465088, "reward_std": 0.5819496115048727, "rewards/equation_reward_func": 0.506944460173448, "rewards/format_reward_func": 0.6736111318071684, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 285.9930610656738, "epoch": 0.3785488958990536, "grad_norm": 2.9295320476594964, "kl": 0.7996622721354166, "learning_rate": 4.999854313415308e-07, "loss": 0.1193, "reward": 1.1388889302810032, "reward_std": 0.5301796098550161, "rewards/equation_reward_func": 0.39583334264655906, "rewards/format_reward_func": 0.7430555721124014, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 269.12500890096027, "epoch": 0.38233438485804416, "grad_norm": 2.6552410076798028, "kl": 1.19091796875, "learning_rate": 4.999823719592071e-07, "loss": 0.216, "reward": 1.2777778208255768, "reward_std": 0.5021173569063345, "rewards/equation_reward_func": 0.506944457689921, "rewards/format_reward_func": 0.7708333532015482, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 274.5069516499837, "epoch": 0.3861198738170347, "grad_norm": 5.1879891100690285, "kl": 2.9518229166666665, "learning_rate": 4.999790212214579e-07, "loss": 0.1756, "reward": 1.2430555870135624, "reward_std": 0.58441444983085, "rewards/equation_reward_func": 0.479166679084301, "rewards/format_reward_func": 0.7638889054457346, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 248.79167302449545, "epoch": 0.3899053627760252, "grad_norm": 3.1959715484572917, "kl": 1.0735677083333333, "learning_rate": 4.999753791321885e-07, "loss": 0.1732, "reward": 1.3750000496705372, "reward_std": 0.5170091787974039, "rewards/equation_reward_func": 0.5902777947485447, "rewards/format_reward_func": 0.7847222487131754, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 244.10417302449545, "epoch": 0.3936908517350158, "grad_norm": 14.582535270082563, "kl": 6.297200520833333, "learning_rate": 4.999714456956438e-07, "loss": 0.0727, "reward": 1.2986111442248027, "reward_std": 0.5151846868296465, "rewards/equation_reward_func": 0.5069444552063942, "rewards/format_reward_func": 0.7916666815678278, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 242.52778244018555, "epoch": 0.39747634069400634, "grad_norm": 151.19489080244557, "kl": 27.640625, "learning_rate": 4.99967220916408e-07, "loss": 0.0915, "reward": 1.3958333532015483, "reward_std": 0.48437386751174927, "rewards/equation_reward_func": 0.5486111175268888, "rewards/format_reward_func": 0.8472222437461218, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 185.7916717529297, "epoch": 0.40126182965299684, "grad_norm": 37.78796510721226, "kl": 9.738444010416666, "learning_rate": 4.999627047994053e-07, "loss": 0.0349, "reward": 1.4375000596046448, "reward_std": 0.48517493655284244, "rewards/equation_reward_func": 0.5763889048248529, "rewards/format_reward_func": 0.8611111342906952, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 248.83333841959634, "epoch": 0.4050473186119874, "grad_norm": 15.557449401743996, "kl": 1.7869466145833333, "learning_rate": 4.999578973498994e-07, "loss": 0.0905, "reward": 1.2916667064030964, "reward_std": 0.5043560986717542, "rewards/equation_reward_func": 0.4652777872979641, "rewards/format_reward_func": 0.8263889054457346, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 221.31945164998373, "epoch": 0.4088328075709779, "grad_norm": 2.746698671118404, "kl": 2.4767252604166665, "learning_rate": 4.999527985734931e-07, "loss": 0.1176, "reward": 1.3958333681027095, "reward_std": 0.4606535832087199, "rewards/equation_reward_func": 0.5486111280818781, "rewards/format_reward_func": 0.8472222437461218, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 265.2291742960612, "epoch": 0.41261829652996845, "grad_norm": 8.47115489110944, "kl": 2.67626953125, "learning_rate": 4.999474084761293e-07, "loss": 0.1801, "reward": 1.4375000496705372, "reward_std": 0.4704290193816026, "rewards/equation_reward_func": 0.5763889029622078, "rewards/format_reward_func": 0.8611111293236414, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 216.90278498331705, "epoch": 0.416403785488959, "grad_norm": 2.6419184728296528, "kl": 1.8723958333333333, "learning_rate": 4.999417270640898e-07, "loss": 0.0151, "reward": 1.3125000447034836, "reward_std": 0.5176352287332217, "rewards/equation_reward_func": 0.4513888967533906, "rewards/format_reward_func": 0.8611111342906952, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 237.16667556762695, "epoch": 0.4201892744479495, "grad_norm": 3.5632903162508525, "kl": 1.9169108072916667, "learning_rate": 4.999357543439968e-07, "loss": 0.2532, "reward": 1.3263889302810032, "reward_std": 0.46584198499719304, "rewards/equation_reward_func": 0.44444444961845875, "rewards/format_reward_func": 0.8819444676240286, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 250.10417048136392, "epoch": 0.42397476340694007, "grad_norm": 2.836432685345919, "kl": 2.11572265625, "learning_rate": 4.999294903228113e-07, "loss": 0.0877, "reward": 1.3541666964689891, "reward_std": 0.5378451521197954, "rewards/equation_reward_func": 0.5347222381581863, "rewards/format_reward_func": 0.8194444676240286, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 256.95139439900714, "epoch": 0.4277602523659306, "grad_norm": 15.018378327595181, "kl": 8.984842936197916, "learning_rate": 4.999229350078339e-07, "loss": 0.116, "reward": 1.4513889253139496, "reward_std": 0.4579727239906788, "rewards/equation_reward_func": 0.562500019868215, "rewards/format_reward_func": 0.8888889054457346, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 249.65973154703775, "epoch": 0.43154574132492113, "grad_norm": 2.9312130360707225, "kl": 1.6197916666666667, "learning_rate": 4.99916088406705e-07, "loss": 0.1031, "reward": 1.4722222586472828, "reward_std": 0.491986704369386, "rewards/equation_reward_func": 0.5763889017204443, "rewards/format_reward_func": 0.895833358168602, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 237.29167048136392, "epoch": 0.4353312302839117, "grad_norm": 318.50068999038837, "kl": 10.434326171875, "learning_rate": 4.999089505274044e-07, "loss": 0.073, "reward": 1.326388920346896, "reward_std": 0.42563923199971515, "rewards/equation_reward_func": 0.4375000149011612, "rewards/format_reward_func": 0.8888889153798422, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 247.90973027547201, "epoch": 0.4391167192429022, "grad_norm": 4.524241932647995, "kl": 1.73681640625, "learning_rate": 4.999015213782511e-07, "loss": 0.0973, "reward": 1.4375000496705372, "reward_std": 0.5439534323910872, "rewards/equation_reward_func": 0.6597222462296486, "rewards/format_reward_func": 0.7777777959903082, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 241.83334096272787, "epoch": 0.44290220820189274, "grad_norm": 2.1566548938944345, "kl": 6.7578125, "learning_rate": 4.998938009679042e-07, "loss": 0.0664, "reward": 1.4027778307596843, "reward_std": 0.5103383002181848, "rewards/equation_reward_func": 0.5972222362955412, "rewards/format_reward_func": 0.8055555820465088, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 265.61112213134766, "epoch": 0.4466876971608833, "grad_norm": 10.590567425193024, "kl": 1.2277018229166667, "learning_rate": 4.998857893053613e-07, "loss": 0.0824, "reward": 1.4166666964689891, "reward_std": 0.5057607839504877, "rewards/equation_reward_func": 0.5555555745959282, "rewards/format_reward_func": 0.8611111342906952, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 270.2361195882161, "epoch": 0.4504731861198738, "grad_norm": 2.807056262560773, "kl": 2.3780517578125, "learning_rate": 4.998774863999605e-07, "loss": 0.1143, "reward": 1.3888889302810032, "reward_std": 0.38816434393326443, "rewards/equation_reward_func": 0.5138889023413261, "rewards/format_reward_func": 0.8750000149011612, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 282.70834096272785, "epoch": 0.45425867507886436, "grad_norm": 3.1612686868215154, "kl": 1.0327962239583333, "learning_rate": 4.998688922613787e-07, "loss": 0.0685, "reward": 1.4305555919806163, "reward_std": 0.5549860845009486, "rewards/equation_reward_func": 0.6041666784634193, "rewards/format_reward_func": 0.8263889104127884, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 215.09722646077475, "epoch": 0.4580441640378549, "grad_norm": 2.621718223845123, "kl": 6.5284423828125, "learning_rate": 4.998600068996324e-07, "loss": 0.099, "reward": 1.319444477558136, "reward_std": 0.42932410165667534, "rewards/equation_reward_func": 0.5000000211099783, "rewards/format_reward_func": 0.8194444626569748, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 264.2569529215495, "epoch": 0.4618296529968454, "grad_norm": 4.352484643043419, "kl": 0.9919026692708334, "learning_rate": 4.998508303250775e-07, "loss": 0.0482, "reward": 1.48611115415891, "reward_std": 0.5396140466133753, "rewards/equation_reward_func": 0.6041666865348816, "rewards/format_reward_func": 0.8819444527228674, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 243.7986183166504, "epoch": 0.465615141955836, "grad_norm": 5.13544206444599, "kl": 2.6465657552083335, "learning_rate": 4.998413625484094e-07, "loss": 0.1093, "reward": 1.2500000298023224, "reward_std": 0.4689197850724061, "rewards/equation_reward_func": 0.43055556155741215, "rewards/format_reward_func": 0.8194444676240286, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 251.54167302449545, "epoch": 0.4694006309148265, "grad_norm": 2.679299182739803, "kl": 0.9150797526041666, "learning_rate": 4.998316035806628e-07, "loss": 0.1428, "reward": 1.3888889253139496, "reward_std": 0.47515800098578137, "rewards/equation_reward_func": 0.5486111318071684, "rewards/format_reward_func": 0.8402777910232544, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 268.2152837117513, "epoch": 0.47318611987381703, "grad_norm": 1.8553455629336253, "kl": 1.0334065755208333, "learning_rate": 4.998215534332118e-07, "loss": 0.0777, "reward": 1.4861111640930176, "reward_std": 0.40932964409391087, "rewards/equation_reward_func": 0.6250000124176344, "rewards/format_reward_func": 0.8611111293236414, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 267.91667556762695, "epoch": 0.4769716088328076, "grad_norm": 2.3104189096021246, "kl": 1.7921549479166667, "learning_rate": 4.998112121177698e-07, "loss": 0.0391, "reward": 1.3888889253139496, "reward_std": 0.4704259845117728, "rewards/equation_reward_func": 0.5138889048248529, "rewards/format_reward_func": 0.8750000149011612, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 244.9861183166504, "epoch": 0.4807570977917981, "grad_norm": 5.244107644705432, "kl": 3.0703328450520835, "learning_rate": 4.9980057964639e-07, "loss": 0.1144, "reward": 1.3680555870135624, "reward_std": 0.3652517894903819, "rewards/equation_reward_func": 0.4444444552063942, "rewards/format_reward_func": 0.9236111243565878, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 242.57639694213867, "epoch": 0.48454258675078865, "grad_norm": 3.0461805649624036, "kl": 0.55810546875, "learning_rate": 4.99789656031464e-07, "loss": 0.1251, "reward": 1.451388915379842, "reward_std": 0.43821969131628674, "rewards/equation_reward_func": 0.5416666828095913, "rewards/format_reward_func": 0.9097222437461218, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 268.93056360880536, "epoch": 0.48832807570977915, "grad_norm": 2.7875837232126814, "kl": 19.790120442708332, "learning_rate": 4.997784412857239e-07, "loss": 0.1328, "reward": 1.4444445073604584, "reward_std": 0.481424443423748, "rewards/equation_reward_func": 0.5625000136593977, "rewards/format_reward_func": 0.881944457689921, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 243.06944783528647, "epoch": 0.4921135646687697, "grad_norm": 3.3379512405703986, "kl": 2.3311360677083335, "learning_rate": 4.997669354222401e-07, "loss": 0.0831, "reward": 1.4444444874922435, "reward_std": 0.48848551760117215, "rewards/equation_reward_func": 0.5555555758376917, "rewards/format_reward_func": 0.8888889104127884, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 257.15972900390625, "epoch": 0.49589905362776027, "grad_norm": 5.920825260861832, "kl": 2.1470540364583335, "learning_rate": 4.99755138454423e-07, "loss": 0.0901, "reward": 1.4166667014360428, "reward_std": 0.40707051381468773, "rewards/equation_reward_func": 0.5208333519597849, "rewards/format_reward_func": 0.8958333532015482, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 275.7361208597819, "epoch": 0.49968454258675077, "grad_norm": 119.84927693026204, "kl": 16.074625651041668, "learning_rate": 4.997430503960219e-07, "loss": 0.1126, "reward": 1.4236111442248027, "reward_std": 0.44205466161171597, "rewards/equation_reward_func": 0.5347222350537777, "rewards/format_reward_func": 0.8888889104127884, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 283.8333396911621, "epoch": 0.5034700315457413, "grad_norm": 2.3654173997862147, "kl": 1.8214518229166667, "learning_rate": 4.997306712611255e-07, "loss": 0.1992, "reward": 1.4097222487131755, "reward_std": 0.4522901251912117, "rewards/equation_reward_func": 0.5416666778425375, "rewards/format_reward_func": 0.8680555721124014, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 255.3055648803711, "epoch": 0.5072555205047319, "grad_norm": 8.674419859591838, "kl": 1.6661783854166667, "learning_rate": 4.997180010641617e-07, "loss": 0.0642, "reward": 1.4236111640930176, "reward_std": 0.4788891275723775, "rewards/equation_reward_func": 0.5277777922650179, "rewards/format_reward_func": 0.8958333532015482, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 258.4166742960612, "epoch": 0.5110410094637224, "grad_norm": 4.561358423608036, "kl": 1.7342122395833333, "learning_rate": 4.997050398198976e-07, "loss": 0.008, "reward": 1.3125000496705372, "reward_std": 0.4775065655509631, "rewards/equation_reward_func": 0.39583334140479565, "rewards/format_reward_func": 0.9166666815678278, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 295.40278752644855, "epoch": 0.5148264984227129, "grad_norm": 3.607344267562202, "kl": 1.3319905598958333, "learning_rate": 4.996917875434397e-07, "loss": 0.0834, "reward": 1.36111115415891, "reward_std": 0.4735433558622996, "rewards/equation_reward_func": 0.46527778419355553, "rewards/format_reward_func": 0.8958333532015482, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 261.59722900390625, "epoch": 0.5186119873817034, "grad_norm": 2.5043328754943537, "kl": 1.7516276041666667, "learning_rate": 4.996782442502337e-07, "loss": 0.1104, "reward": 1.3750000496705372, "reward_std": 0.49132541194558144, "rewards/equation_reward_func": 0.49305557397504646, "rewards/format_reward_func": 0.8819444527228674, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 286.08334096272785, "epoch": 0.522397476340694, "grad_norm": 6.295893654204792, "kl": 4.499348958333333, "learning_rate": 4.996644099560641e-07, "loss": 0.1441, "reward": 1.4722222685813904, "reward_std": 0.5132550907631716, "rewards/equation_reward_func": 0.5833333507180214, "rewards/format_reward_func": 0.8888889153798422, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 273.93056360880536, "epoch": 0.5261829652996846, "grad_norm": 7.804266363603045, "kl": 1.0720621744791667, "learning_rate": 4.996502846770549e-07, "loss": 0.1438, "reward": 1.3402778059244156, "reward_std": 0.4476064319411914, "rewards/equation_reward_func": 0.4305555584530036, "rewards/format_reward_func": 0.9097222338120142, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 240.97222900390625, "epoch": 0.5299684542586751, "grad_norm": 224.30396185994454, "kl": 22.011637369791668, "learning_rate": 4.996358684296693e-07, "loss": 0.1255, "reward": 1.3680556019147236, "reward_std": 0.41704921424388885, "rewards/equation_reward_func": 0.45138889985779923, "rewards/format_reward_func": 0.9166666815678278, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 245.88889694213867, "epoch": 0.5337539432176656, "grad_norm": 2.9359173813915618, "kl": 4.063395182291667, "learning_rate": 4.996211612307092e-07, "loss": 0.1143, "reward": 1.3333333482344945, "reward_std": 0.4269623930255572, "rewards/equation_reward_func": 0.4791666803260644, "rewards/format_reward_func": 0.8541666865348816, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 288.9722264607747, "epoch": 0.5375394321766561, "grad_norm": 2.754515790547288, "kl": 1.142822265625, "learning_rate": 4.996061630973162e-07, "loss": 0.1758, "reward": 1.4722222636143367, "reward_std": 0.38450759773453075, "rewards/equation_reward_func": 0.5763889042039713, "rewards/format_reward_func": 0.8958333482344946, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 273.65278752644855, "epoch": 0.5413249211356467, "grad_norm": 3.6074116845363675, "kl": 62.014078776041664, "learning_rate": 4.995908740469706e-07, "loss": 0.2716, "reward": 1.3750000447034836, "reward_std": 0.4357808977365494, "rewards/equation_reward_func": 0.45833334140479565, "rewards/format_reward_func": 0.9166666815678278, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 254.61111958821616, "epoch": 0.5451104100946372, "grad_norm": 2.833549076551859, "kl": 0.9168701171875, "learning_rate": 4.995752940974918e-07, "loss": 0.1139, "reward": 1.4652778108914692, "reward_std": 0.500111423432827, "rewards/equation_reward_func": 0.5416666865348816, "rewards/format_reward_func": 0.9236111293236414, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 266.50000890096027, "epoch": 0.5488958990536278, "grad_norm": 2.0285819176753637, "kl": 0.7223714192708334, "learning_rate": 4.995594232670383e-07, "loss": 0.0795, "reward": 1.5000000298023224, "reward_std": 0.3858482278883457, "rewards/equation_reward_func": 0.5902777904023727, "rewards/format_reward_func": 0.909722238779068, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 242.50000953674316, "epoch": 0.5526813880126183, "grad_norm": 2.469130613713446, "kl": 7.028157552083333, "learning_rate": 4.995432615741076e-07, "loss": 0.0928, "reward": 1.5208333730697632, "reward_std": 0.3851733220120271, "rewards/equation_reward_func": 0.5972222313284874, "rewards/format_reward_func": 0.9236111293236414, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 256.7430610656738, "epoch": 0.5564668769716088, "grad_norm": 2.818264802652981, "kl": 0.8765869140625, "learning_rate": 4.995268090375362e-07, "loss": 0.134, "reward": 1.4930555919806163, "reward_std": 0.4773927927017212, "rewards/equation_reward_func": 0.6041666840513548, "rewards/format_reward_func": 0.8888889054457346, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 269.722230275472, "epoch": 0.5602523659305993, "grad_norm": 2.6290072054683082, "kl": 1.646728515625, "learning_rate": 4.995100656764996e-07, "loss": 0.111, "reward": 1.3402778108914692, "reward_std": 0.45711999386548996, "rewards/equation_reward_func": 0.430555568387111, "rewards/format_reward_func": 0.9097222437461218, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 283.94445419311523, "epoch": 0.5640378548895899, "grad_norm": 3.52286689241144, "kl": 1.5117594401041667, "learning_rate": 4.994930315105124e-07, "loss": 0.1291, "reward": 1.4722222586472828, "reward_std": 0.4221850348015626, "rewards/equation_reward_func": 0.5763889029622078, "rewards/format_reward_func": 0.8958333532015482, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 260.83334096272785, "epoch": 0.5678233438485805, "grad_norm": 3.048640806478669, "kl": 8.982340494791666, "learning_rate": 4.994757065594279e-07, "loss": 0.1167, "reward": 1.4236111442248027, "reward_std": 0.4365849755704403, "rewards/equation_reward_func": 0.534722238779068, "rewards/format_reward_func": 0.8888889104127884, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 251.4861208597819, "epoch": 0.571608832807571, "grad_norm": 5.880882126873241, "kl": 2.01953125, "learning_rate": 4.994580908434383e-07, "loss": 0.2153, "reward": 1.3750000298023224, "reward_std": 0.4684516203900178, "rewards/equation_reward_func": 0.47916667846341926, "rewards/format_reward_func": 0.8958333482344946, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 308.96528244018555, "epoch": 0.5753943217665615, "grad_norm": 7.940563386747667, "kl": 2.2464192708333335, "learning_rate": 4.994401843830749e-07, "loss": 0.2154, "reward": 1.2638889352480571, "reward_std": 0.516243410607179, "rewards/equation_reward_func": 0.41666668343047303, "rewards/format_reward_func": 0.8472222437461218, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 232.1180623372396, "epoch": 0.579179810725552, "grad_norm": 305.4523441721458, "kl": 29.108561197916668, "learning_rate": 4.994219871992076e-07, "loss": 0.2207, "reward": 1.4375000298023224, "reward_std": 0.45513641958435375, "rewards/equation_reward_func": 0.5277778009573618, "rewards/format_reward_func": 0.909722238779068, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 264.7777862548828, "epoch": 0.5829652996845426, "grad_norm": 2.0715581627005784, "kl": 1.160400390625, "learning_rate": 4.994034993130455e-07, "loss": 0.1089, "reward": 1.3958333730697632, "reward_std": 0.3590660902361075, "rewards/equation_reward_func": 0.47222223443289596, "rewards/format_reward_func": 0.9236111243565878, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 257.13195419311523, "epoch": 0.5867507886435331, "grad_norm": 3.5395487394835476, "kl": 1.2493489583333333, "learning_rate": 4.993847207461362e-07, "loss": 0.1119, "reward": 1.3194444924592972, "reward_std": 0.40260318542520207, "rewards/equation_reward_func": 0.4236111293236415, "rewards/format_reward_func": 0.8958333532015482, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 248.52778498331705, "epoch": 0.5905362776025237, "grad_norm": 3.948194000938186, "kl": 1.1299641927083333, "learning_rate": 4.993656515203662e-07, "loss": 0.1778, "reward": 1.3819444874922435, "reward_std": 0.39707954103748005, "rewards/equation_reward_func": 0.465277789781491, "rewards/format_reward_func": 0.9166666766007742, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 251.06250508626303, "epoch": 0.5943217665615141, "grad_norm": 3.3126376703610187, "kl": 1.5913899739583333, "learning_rate": 4.993462916579606e-07, "loss": 0.1415, "reward": 1.4027778059244156, "reward_std": 0.415769978115956, "rewards/equation_reward_func": 0.5069444589316845, "rewards/format_reward_func": 0.8958333532015482, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 267.7361195882161, "epoch": 0.5981072555205047, "grad_norm": 3.6712572603171045, "kl": 1.0328776041666667, "learning_rate": 4.993266411814837e-07, "loss": 0.1356, "reward": 1.5138889253139496, "reward_std": 0.43073243647813797, "rewards/equation_reward_func": 0.6180555745959282, "rewards/format_reward_func": 0.8958333482344946, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 260.7569516499837, "epoch": 0.6018927444794953, "grad_norm": 2.898300493316585, "kl": 1.694091796875, "learning_rate": 4.993067001138379e-07, "loss": 0.1933, "reward": 1.3958333830038707, "reward_std": 0.45616808036963147, "rewards/equation_reward_func": 0.5138889054457346, "rewards/format_reward_func": 0.881944457689921, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 243.38889439900717, "epoch": 0.6056782334384858, "grad_norm": 2.3012954584762206, "kl": 1.1136881510416667, "learning_rate": 4.992864684782648e-07, "loss": 0.0314, "reward": 1.423611159125964, "reward_std": 0.4477810760339101, "rewards/equation_reward_func": 0.4930555696288745, "rewards/format_reward_func": 0.9305555721124014, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 252.68750635782877, "epoch": 0.6094637223974764, "grad_norm": 10.558231881280353, "kl": 7.27197265625, "learning_rate": 4.992659462983445e-07, "loss": 0.1837, "reward": 1.4444444874922435, "reward_std": 0.4468059837818146, "rewards/equation_reward_func": 0.5416666797051827, "rewards/format_reward_func": 0.9027777959903082, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 223.90278244018555, "epoch": 0.6132492113564669, "grad_norm": 8.24083470994998, "kl": 1.4090983072916667, "learning_rate": 4.992451335979955e-07, "loss": 0.0984, "reward": 1.4513889253139496, "reward_std": 0.34703291207551956, "rewards/equation_reward_func": 0.5000000136593977, "rewards/format_reward_func": 0.951388900478681, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 246.9583396911621, "epoch": 0.6170347003154574, "grad_norm": 3.165272632330998, "kl": 1.4227701822916667, "learning_rate": 4.992240304014751e-07, "loss": 0.0434, "reward": 1.381944477558136, "reward_std": 0.3748237465818723, "rewards/equation_reward_func": 0.4583333457509677, "rewards/format_reward_func": 0.9236111243565878, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 248.64584477742514, "epoch": 0.6208201892744479, "grad_norm": 3.802073252683938, "kl": 1.0417887369791667, "learning_rate": 4.992026367333793e-07, "loss": 0.0662, "reward": 1.5347222487131755, "reward_std": 0.3855091730753581, "rewards/equation_reward_func": 0.6111111243565878, "rewards/format_reward_func": 0.9236111243565878, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 251.90278498331705, "epoch": 0.6246056782334385, "grad_norm": 3.1898348924774695, "kl": 1.318359375, "learning_rate": 4.991809526186423e-07, "loss": 0.1018, "reward": 1.4930555919806163, "reward_std": 0.4848398119211197, "rewards/equation_reward_func": 0.5694444676240286, "rewards/format_reward_func": 0.9236111293236414, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 270.7916781107585, "epoch": 0.628391167192429, "grad_norm": 48.65745253251759, "kl": 9.658447265625, "learning_rate": 4.991589780825373e-07, "loss": 0.2243, "reward": 1.6180556019147236, "reward_std": 0.38904641941189766, "rewards/equation_reward_func": 0.722222238779068, "rewards/format_reward_func": 0.8958333532015482, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 247.05556360880533, "epoch": 0.6321766561514196, "grad_norm": 12.85876415564074, "kl": 2.3059895833333335, "learning_rate": 4.991367131506753e-07, "loss": 0.0952, "reward": 1.4930555919806163, "reward_std": 0.44105598827203113, "rewards/equation_reward_func": 0.5902777935067812, "rewards/format_reward_func": 0.9027777959903082, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 252.79861958821616, "epoch": 0.63596214511041, "grad_norm": 3.766932765553029, "kl": 1.00732421875, "learning_rate": 4.991141578490066e-07, "loss": 0.108, "reward": 1.4305556019147236, "reward_std": 0.4160829931497574, "rewards/equation_reward_func": 0.5138888967533907, "rewards/format_reward_func": 0.9166666766007742, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 257.6805610656738, "epoch": 0.6397476340694006, "grad_norm": 8.363557603327017, "kl": 2.90673828125, "learning_rate": 4.990913122038193e-07, "loss": 0.0988, "reward": 1.506944477558136, "reward_std": 0.4711163180569808, "rewards/equation_reward_func": 0.5833333494762579, "rewards/format_reward_func": 0.9236111293236414, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 278.05556360880536, "epoch": 0.6435331230283912, "grad_norm": 2.4831862429823874, "kl": 1.1470540364583333, "learning_rate": 4.9906817624174e-07, "loss": 0.1149, "reward": 1.4583333780368168, "reward_std": 0.40201255182425183, "rewards/equation_reward_func": 0.5486111318071684, "rewards/format_reward_func": 0.9097222338120142, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 270.8125114440918, "epoch": 0.6473186119873817, "grad_norm": 96.69755111218885, "kl": 18.217529296875, "learning_rate": 4.990447499897339e-07, "loss": 0.1482, "reward": 1.4166666964689891, "reward_std": 0.4657805400590102, "rewards/equation_reward_func": 0.500000019868215, "rewards/format_reward_func": 0.9166666865348816, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 280.9513982137044, "epoch": 0.6511041009463723, "grad_norm": 4.4626269454999035, "kl": 1.0166829427083333, "learning_rate": 4.990210334751042e-07, "loss": 0.2191, "reward": 1.4305555919806163, "reward_std": 0.5064363280932108, "rewards/equation_reward_func": 0.5208333445092043, "rewards/format_reward_func": 0.909722238779068, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 290.0277862548828, "epoch": 0.6548895899053627, "grad_norm": 42.001704471875875, "kl": 7.866048177083333, "learning_rate": 4.989970267254928e-07, "loss": 0.3399, "reward": 1.37500003973643, "reward_std": 0.4621751358111699, "rewards/equation_reward_func": 0.5138889104127884, "rewards/format_reward_func": 0.8611111243565878, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 287.0277849833171, "epoch": 0.6586750788643533, "grad_norm": 401.0064206569611, "kl": 13.825358072916666, "learning_rate": 4.989727297688796e-07, "loss": 0.2614, "reward": 1.4930555919806163, "reward_std": 0.48149604598681134, "rewards/equation_reward_func": 0.6319444614152113, "rewards/format_reward_func": 0.8611111342906952, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 267.2222277323405, "epoch": 0.6624605678233438, "grad_norm": 5.58193017827173, "kl": 1.5638020833333333, "learning_rate": 4.989481426335828e-07, "loss": 0.2184, "reward": 1.4791667064030964, "reward_std": 0.32900576541821164, "rewards/equation_reward_func": 0.583333345130086, "rewards/format_reward_func": 0.8958333482344946, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 312.2222315470378, "epoch": 0.6662460567823344, "grad_norm": 2.903611804665768, "kl": 1.7395833333333333, "learning_rate": 4.989232653482587e-07, "loss": 0.2021, "reward": 1.4305555919806163, "reward_std": 0.4162732983628909, "rewards/equation_reward_func": 0.5486111268401146, "rewards/format_reward_func": 0.8819444676240286, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 295.96528752644855, "epoch": 0.670031545741325, "grad_norm": 8.614948807031883, "kl": 1.4444986979166667, "learning_rate": 4.98898097941902e-07, "loss": 0.2504, "reward": 1.3194444825251896, "reward_std": 0.3698546774685383, "rewards/equation_reward_func": 0.43750001179675263, "rewards/format_reward_func": 0.881944457689921, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 296.6111208597819, "epoch": 0.6738170347003155, "grad_norm": 18.17568858303832, "kl": 4.43408203125, "learning_rate": 4.988726404438453e-07, "loss": 0.2654, "reward": 1.2569444924592972, "reward_std": 0.5792658850550652, "rewards/equation_reward_func": 0.43750001303851604, "rewards/format_reward_func": 0.8194444626569748, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 297.1527862548828, "epoch": 0.677602523659306, "grad_norm": 3.3997948870685444, "kl": 2.4781901041666665, "learning_rate": 4.988468928837595e-07, "loss": 0.2077, "reward": 1.4027778307596843, "reward_std": 0.43186015884081524, "rewards/equation_reward_func": 0.5625000142802795, "rewards/format_reward_func": 0.8402777959903082, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 259.7013969421387, "epoch": 0.6813880126182965, "grad_norm": 3.261696651794849, "kl": 2.1082763671875, "learning_rate": 4.988208552916535e-07, "loss": 0.1781, "reward": 1.388888920346896, "reward_std": 0.4762779163817565, "rewards/equation_reward_func": 0.5069444607943296, "rewards/format_reward_func": 0.881944457689921, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 285.75695673624676, "epoch": 0.6851735015772871, "grad_norm": 436.9538386873056, "kl": 90.0078125, "learning_rate": 4.987945276978741e-07, "loss": 0.6442, "reward": 1.2361111342906952, "reward_std": 0.47308399528265, "rewards/equation_reward_func": 0.3888889054457347, "rewards/format_reward_func": 0.8472222437461218, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 253.26389821370444, "epoch": 0.6889589905362776, "grad_norm": 6.275698981154313, "kl": 1.6064453125, "learning_rate": 4.987679101331063e-07, "loss": 0.2335, "reward": 1.4861111442248027, "reward_std": 0.4897613674402237, "rewards/equation_reward_func": 0.6041666890184084, "rewards/format_reward_func": 0.8819444626569748, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 225.61111958821616, "epoch": 0.6927444794952682, "grad_norm": 5.577871979120166, "kl": 0.7556966145833334, "learning_rate": 4.987410026283729e-07, "loss": 0.1068, "reward": 1.48611115415891, "reward_std": 0.5080769136548042, "rewards/equation_reward_func": 0.5763889054457346, "rewards/format_reward_func": 0.9097222437461218, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 208.28472900390625, "epoch": 0.6965299684542586, "grad_norm": 34.81645021530138, "kl": 5.219563802083333, "learning_rate": 4.98713805215035e-07, "loss": 0.1549, "reward": 1.4583333830038707, "reward_std": 0.40722255781292915, "rewards/equation_reward_func": 0.5625000142802795, "rewards/format_reward_func": 0.8958333532015482, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 267.7986183166504, "epoch": 0.7003154574132492, "grad_norm": 10.388065090744742, "kl": 10.697916666666666, "learning_rate": 4.986863179247908e-07, "loss": 0.1906, "reward": 1.3750000447034836, "reward_std": 0.47181837012370426, "rewards/equation_reward_func": 0.5069444558272759, "rewards/format_reward_func": 0.8680555721124014, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 229.8611157735189, "epoch": 0.7041009463722397, "grad_norm": 4.562876059825846, "kl": 4.035807291666667, "learning_rate": 4.986585407896771e-07, "loss": 0.223, "reward": 1.4027778208255768, "reward_std": 0.5173191850384077, "rewards/equation_reward_func": 0.5486111293236414, "rewards/format_reward_func": 0.854166696468989, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 229.1666742960612, "epoch": 0.7078864353312303, "grad_norm": 6.466655997110351, "kl": 758.0651041666666, "learning_rate": 4.986304738420683e-07, "loss": 0.4869, "reward": 1.4305555820465088, "reward_std": 0.4751903774837653, "rewards/equation_reward_func": 0.5763888955116272, "rewards/format_reward_func": 0.8541666865348816, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 251.9236208597819, "epoch": 0.7116719242902209, "grad_norm": 49.29790482270018, "kl": 13.262369791666666, "learning_rate": 4.986021171146764e-07, "loss": 0.3513, "reward": 1.354166716337204, "reward_std": 0.5414688164989153, "rewards/equation_reward_func": 0.5138889054457346, "rewards/format_reward_func": 0.8402777959903082, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 254.1666774749756, "epoch": 0.7154574132492113, "grad_norm": 5.643615815413666, "kl": 7.41162109375, "learning_rate": 4.985734706405516e-07, "loss": 0.2591, "reward": 1.2777778059244156, "reward_std": 0.4625398740172386, "rewards/equation_reward_func": 0.4513889004786809, "rewards/format_reward_func": 0.8263889203468958, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 289.9583396911621, "epoch": 0.7192429022082019, "grad_norm": 304.8418060986503, "kl": 665.8196614583334, "learning_rate": 4.98544534453081e-07, "loss": 1.0021, "reward": 1.2708333830038707, "reward_std": 0.4970496619741122, "rewards/equation_reward_func": 0.534722234432896, "rewards/format_reward_func": 0.736111139257749, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 247.37500635782877, "epoch": 0.7230283911671924, "grad_norm": 11.586701386430356, "kl": 8.091145833333334, "learning_rate": 4.985153085859902e-07, "loss": 0.2491, "reward": 1.43750003973643, "reward_std": 0.5147989491621653, "rewards/equation_reward_func": 0.6458333432674408, "rewards/format_reward_func": 0.7916666865348816, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 264.2847315470378, "epoch": 0.726813880126183, "grad_norm": 9.752593632001947, "kl": 11.559244791666666, "learning_rate": 4.984857930733419e-07, "loss": 0.3493, "reward": 1.1111111392577488, "reward_std": 0.47952866181731224, "rewards/equation_reward_func": 0.347222230086724, "rewards/format_reward_func": 0.7638889104127884, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 269.0902849833171, "epoch": 0.7305993690851735, "grad_norm": 9.316145758908815, "kl": 11.126953125, "learning_rate": 4.984559879495366e-07, "loss": 0.3237, "reward": 1.201388920346896, "reward_std": 0.6368941242496172, "rewards/equation_reward_func": 0.4861111231148243, "rewards/format_reward_func": 0.7152778009573618, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 255.90972900390625, "epoch": 0.7343848580441641, "grad_norm": 8.827779574351993, "kl": 121.453125, "learning_rate": 4.984258932493123e-07, "loss": 0.5189, "reward": 1.2638889352480571, "reward_std": 0.5239984119931856, "rewards/equation_reward_func": 0.5555555671453476, "rewards/format_reward_func": 0.708333358168602, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 237.27084159851074, "epoch": 0.7381703470031545, "grad_norm": 8.013012272149158, "kl": 24.984375, "learning_rate": 4.983955090077444e-07, "loss": 0.2832, "reward": 1.1597222437461217, "reward_std": 0.5535530770818392, "rewards/equation_reward_func": 0.48611112497746944, "rewards/format_reward_func": 0.6736111268401146, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 249.63889821370444, "epoch": 0.7419558359621451, "grad_norm": 438.51211315014166, "kl": 126.08072916666667, "learning_rate": 4.983648352602459e-07, "loss": 0.3395, "reward": 1.1250000298023224, "reward_std": 0.6015344088276228, "rewards/equation_reward_func": 0.4930555646618207, "rewards/format_reward_func": 0.6319444750746092, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 195.81250508626303, "epoch": 0.7457413249211357, "grad_norm": 13.870204564822584, "kl": 9.074869791666666, "learning_rate": 4.983338720425672e-07, "loss": 0.2873, "reward": 1.1805555770794551, "reward_std": 0.6060735906163851, "rewards/equation_reward_func": 0.4583333407839139, "rewards/format_reward_func": 0.7222222437461218, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 254.87500699361166, "epoch": 0.7495268138801262, "grad_norm": 82.14394465970908, "kl": 38.481770833333336, "learning_rate": 4.98302619390796e-07, "loss": 0.3067, "reward": 1.1250000298023224, "reward_std": 0.4945492781698704, "rewards/equation_reward_func": 0.5277777904023727, "rewards/format_reward_func": 0.5972222425043583, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 221.51389439900717, "epoch": 0.7533123028391168, "grad_norm": 17.556835883262877, "kl": 97.25, "learning_rate": 4.982710773413576e-07, "loss": 0.3719, "reward": 1.131944477558136, "reward_std": 0.588702150930961, "rewards/equation_reward_func": 0.5763889079292616, "rewards/format_reward_func": 0.555555577079455, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 197.82639376322427, "epoch": 0.7570977917981072, "grad_norm": 26.30378944955965, "kl": 17.8984375, "learning_rate": 4.98239245931014e-07, "loss": 0.3139, "reward": 1.1805555870135624, "reward_std": 0.5916161189476649, "rewards/equation_reward_func": 0.5902777959903082, "rewards/format_reward_func": 0.5902777959903082, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 222.54861704508463, "epoch": 0.7608832807570978, "grad_norm": 10.696318069471166, "kl": 14.2109375, "learning_rate": 4.982071251968652e-07, "loss": 0.2388, "reward": 1.1041666964689891, "reward_std": 0.5821270644664764, "rewards/equation_reward_func": 0.5069444638987383, "rewards/format_reward_func": 0.5972222437461218, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 219.2916742960612, "epoch": 0.7646687697160883, "grad_norm": 132.22587525968703, "kl": 40.453125, "learning_rate": 4.981747151763478e-07, "loss": 0.2509, "reward": 1.0208333631356556, "reward_std": 0.6254869078596433, "rewards/equation_reward_func": 0.493055568387111, "rewards/format_reward_func": 0.5277777959903082, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 213.25000445048013, "epoch": 0.7684542586750789, "grad_norm": 53.75808201656059, "kl": 28.166666666666668, "learning_rate": 4.981420159072359e-07, "loss": 0.3216, "reward": 0.923611139257749, "reward_std": 0.5980016005535921, "rewards/equation_reward_func": 0.39583334513008595, "rewards/format_reward_func": 0.5277777860562006, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 255.5486208597819, "epoch": 0.7722397476340694, "grad_norm": 76.3126195344439, "kl": 24.140625, "learning_rate": 4.981090274276405e-07, "loss": 0.2661, "reward": 1.0833333681027095, "reward_std": 0.6427489096919695, "rewards/equation_reward_func": 0.5833333482344946, "rewards/format_reward_func": 0.500000019868215, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 223.21528244018555, "epoch": 0.7760252365930599, "grad_norm": 13589.418456534844, "kl": 1149.7135416666667, "learning_rate": 4.9807574977601e-07, "loss": 2.3024, "reward": 0.9375000447034836, "reward_std": 0.6095106812814871, "rewards/equation_reward_func": 0.42361112497746944, "rewards/format_reward_func": 0.5138889029622078, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 227.54167366027832, "epoch": 0.7798107255520504, "grad_norm": 33.78184675982937, "kl": 29.5703125, "learning_rate": 4.980421829911295e-07, "loss": 0.269, "reward": 0.8541667014360428, "reward_std": 0.6479750176270803, "rewards/equation_reward_func": 0.43055556900799274, "rewards/format_reward_func": 0.4236111268401146, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 274.4166742960612, "epoch": 0.783596214511041, "grad_norm": 17.81633266386669, "kl": 28.666666666666668, "learning_rate": 4.980083271121214e-07, "loss": 0.3345, "reward": 0.909722238779068, "reward_std": 0.6108483547965685, "rewards/equation_reward_func": 0.5277777959903082, "rewards/format_reward_func": 0.3819444576899211, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 251.74306106567383, "epoch": 0.7873817034700316, "grad_norm": 22.608432736994907, "kl": 51.177083333333336, "learning_rate": 4.979741821784445e-07, "loss": 0.2628, "reward": 0.8680555870135626, "reward_std": 0.6757829288641611, "rewards/equation_reward_func": 0.4583333457509677, "rewards/format_reward_func": 0.40972222822407883, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 265.33334096272785, "epoch": 0.7911671924290221, "grad_norm": 13.10404823539201, "kl": 27.53125, "learning_rate": 4.979397482298952e-07, "loss": 0.3222, "reward": 0.7916666939854622, "reward_std": 0.619778610765934, "rewards/equation_reward_func": 0.38194445582727593, "rewards/format_reward_func": 0.4097222313284874, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 245.4513931274414, "epoch": 0.7949526813880127, "grad_norm": 17.285612572481327, "kl": 25.333333333333332, "learning_rate": 4.979050253066063e-07, "loss": 0.2375, "reward": 0.937500019868215, "reward_std": 0.5681246320406595, "rewards/equation_reward_func": 0.5000000136593977, "rewards/format_reward_func": 0.4375000074505806, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 247.6736183166504, "epoch": 0.7987381703470031, "grad_norm": 114.10864728746037, "kl": 68.94791666666667, "learning_rate": 4.978700134490473e-07, "loss": 0.3221, "reward": 0.9861111293236414, "reward_std": 0.6230639989177386, "rewards/equation_reward_func": 0.4791666778425376, "rewards/format_reward_func": 0.5069444589316845, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 268.19445037841797, "epoch": 0.8025236593059937, "grad_norm": 27.278356050728746, "kl": 63.755208333333336, "learning_rate": 4.97834712698025e-07, "loss": 0.3404, "reward": 0.9027778077870607, "reward_std": 0.6374689054985841, "rewards/equation_reward_func": 0.5208333469927311, "rewards/format_reward_func": 0.3819444514811039, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 261.0486183166504, "epoch": 0.8063091482649842, "grad_norm": 112.26698272573795, "kl": 125.875, "learning_rate": 4.977991230946823e-07, "loss": 0.3086, "reward": 0.9791666915019354, "reward_std": 0.6475708857178688, "rewards/equation_reward_func": 0.5763889091710249, "rewards/format_reward_func": 0.40277778916060925, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 304.7708435058594, "epoch": 0.8100946372239748, "grad_norm": 88.42317906709971, "kl": 145.625, "learning_rate": 4.977632446804992e-07, "loss": 0.3789, "reward": 0.784722238779068, "reward_std": 0.6482410331567129, "rewards/equation_reward_func": 0.451388909171025, "rewards/format_reward_func": 0.33333334388832253, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 276.3194529215495, "epoch": 0.8138801261829653, "grad_norm": 57.008405478973984, "kl": 90.27083333333333, "learning_rate": 4.97727077497292e-07, "loss": 0.3829, "reward": 0.8888889054457346, "reward_std": 0.58370058486859, "rewards/equation_reward_func": 0.5763889079292616, "rewards/format_reward_func": 0.3125000074505806, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 274.0277837117513, "epoch": 0.8176656151419558, "grad_norm": 41.9648702558559, "kl": 93.64973958333333, "learning_rate": 4.976906215872137e-07, "loss": 0.2295, "reward": 0.8263889054457346, "reward_std": 0.6093253418803215, "rewards/equation_reward_func": 0.4861111243565877, "rewards/format_reward_func": 0.34027778419355553, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 366.2569580078125, "epoch": 0.8214511041009463, "grad_norm": 23.70562238362353, "kl": 49.619791666666664, "learning_rate": 4.976538769927538e-07, "loss": 0.2481, "reward": 0.5763888992369175, "reward_std": 0.6349846472342809, "rewards/equation_reward_func": 0.3541666728754838, "rewards/format_reward_func": 0.2222222244987885, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 322.51389439900714, "epoch": 0.8252365930599369, "grad_norm": 173.94515224795077, "kl": 55.354166666666664, "learning_rate": 4.976168437567384e-07, "loss": 0.2866, "reward": 0.7361111330489317, "reward_std": 0.5293329904476801, "rewards/equation_reward_func": 0.4652777922650178, "rewards/format_reward_func": 0.2708333395421505, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 370.1041781107585, "epoch": 0.8290220820189275, "grad_norm": 49.42785489603644, "kl": 38.208333333333336, "learning_rate": 4.975795219223298e-07, "loss": 0.2725, "reward": 0.6250000161429247, "reward_std": 0.6038348153233528, "rewards/equation_reward_func": 0.39583334513008595, "rewards/format_reward_func": 0.2291666710128387, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 358.9305674235026, "epoch": 0.832807570977918, "grad_norm": 63.93647805172191, "kl": 38.565104166666664, "learning_rate": 4.975419115330267e-07, "loss": 0.2397, "reward": 0.6388889079292616, "reward_std": 0.5783760311702887, "rewards/equation_reward_func": 0.40972224312524, "rewards/format_reward_func": 0.2291666722546021, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 343.5486208597819, "epoch": 0.8365930599369085, "grad_norm": 18.35613118498554, "kl": 40.0, "learning_rate": 4.975040126326641e-07, "loss": 0.3459, "reward": 0.7291666766007742, "reward_std": 0.6189329201976458, "rewards/equation_reward_func": 0.46527778916060925, "rewards/format_reward_func": 0.2638888967533906, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 334.00001017252606, "epoch": 0.840378548895899, "grad_norm": 61.315133397385836, "kl": 64.54166666666667, "learning_rate": 4.974658252654134e-07, "loss": 0.3642, "reward": 0.6111111268401146, "reward_std": 0.6266890317201614, "rewards/equation_reward_func": 0.3402777823309104, "rewards/format_reward_func": 0.27083334140479565, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 287.41667556762695, "epoch": 0.8441640378548896, "grad_norm": 111.02963258009683, "kl": 73.3125, "learning_rate": 4.974273494757822e-07, "loss": 0.2892, "reward": 0.736111139257749, "reward_std": 0.5954531555374464, "rewards/equation_reward_func": 0.430555568387111, "rewards/format_reward_func": 0.3055555609365304, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 315.1458447774251, "epoch": 0.8479495268138801, "grad_norm": 416.3212915048579, "kl": 112.47135416666667, "learning_rate": 4.973885853086141e-07, "loss": 0.3557, "reward": 0.7083333532015482, "reward_std": 0.595863493780295, "rewards/equation_reward_func": 0.43750001055498916, "rewards/format_reward_func": 0.27083333892126876, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 282.88195419311523, "epoch": 0.8517350157728707, "grad_norm": 84.57887686537714, "kl": 97.375, "learning_rate": 4.973495328090889e-07, "loss": 0.4201, "reward": 0.5625000124176344, "reward_std": 0.6184229714175066, "rewards/equation_reward_func": 0.24305556466182074, "rewards/format_reward_func": 0.31944445334374905, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 340.87501271565753, "epoch": 0.8555205047318613, "grad_norm": 139.9178717709, "kl": 92.39583333333333, "learning_rate": 4.973101920227225e-07, "loss": 0.3206, "reward": 0.5555555683871111, "reward_std": 0.6198337351282438, "rewards/equation_reward_func": 0.3263888955116272, "rewards/format_reward_func": 0.22916667287548384, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 351.2916768391927, "epoch": 0.8593059936908517, "grad_norm": 168.90078404576994, "kl": 58.34375, "learning_rate": 4.972705629953667e-07, "loss": 0.3032, "reward": 0.7083333482344946, "reward_std": 0.6670572757720947, "rewards/equation_reward_func": 0.395833349476258, "rewards/format_reward_func": 0.3125000074505806, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 346.7777913411458, "epoch": 0.8630914826498423, "grad_norm": 53.736984247481196, "kl": 71.42708333333333, "learning_rate": 4.97230645773209e-07, "loss": 0.3515, "reward": 0.6180555665244659, "reward_std": 0.5822310447692871, "rewards/equation_reward_func": 0.3680555621782939, "rewards/format_reward_func": 0.25000000682969886, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 329.21528879801434, "epoch": 0.8668769716088328, "grad_norm": 61.81724196878047, "kl": 71.97395833333333, "learning_rate": 4.971904404027736e-07, "loss": 0.3712, "reward": 0.5972222362955412, "reward_std": 0.6221836258967718, "rewards/equation_reward_func": 0.34722223319113255, "rewards/format_reward_func": 0.2500000062088172, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 400.7083511352539, "epoch": 0.8706624605678234, "grad_norm": 89.56376909680318, "kl": 93.92708333333333, "learning_rate": 4.971499469309197e-07, "loss": 0.3209, "reward": 0.5486111330489317, "reward_std": 0.5003731027245522, "rewards/equation_reward_func": 0.3611111169060071, "rewards/format_reward_func": 0.18750000558793545, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 379.7430674235026, "epoch": 0.8744479495268139, "grad_norm": 96.80934872874563, "kl": 71.47916666666667, "learning_rate": 4.971091654048427e-07, "loss": 0.2863, "reward": 0.4166666828095913, "reward_std": 0.5312095309297243, "rewards/equation_reward_func": 0.26388889489074546, "rewards/format_reward_func": 0.15277778233091036, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 379.1250063578288, "epoch": 0.8782334384858044, "grad_norm": 313.3277909671654, "kl": 157.80208333333334, "learning_rate": 4.970680958720733e-07, "loss": 0.5211, "reward": 0.48611112746099633, "reward_std": 0.5439305094381174, "rewards/equation_reward_func": 0.2222222313284874, "rewards/format_reward_func": 0.2638888992369175, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 365.0416742960612, "epoch": 0.8820189274447949, "grad_norm": 121.11532507159346, "kl": 131.42708333333334, "learning_rate": 4.970267383804787e-07, "loss": 0.4011, "reward": 0.4375000149011612, "reward_std": 0.5364614203572273, "rewards/equation_reward_func": 0.28472223070760566, "rewards/format_reward_func": 0.1527777792265018, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 354.7916742960612, "epoch": 0.8858044164037855, "grad_norm": 90.17803998443027, "kl": 152.9375, "learning_rate": 4.96985092978261e-07, "loss": 0.4152, "reward": 0.4305555696288745, "reward_std": 0.5253821363051733, "rewards/equation_reward_func": 0.2847222325702508, "rewards/format_reward_func": 0.14583333457509676, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 281.70834223429364, "epoch": 0.889589905362776, "grad_norm": 90.52105948028516, "kl": 115.625, "learning_rate": 4.969431597139581e-07, "loss": 0.2493, "reward": 0.5000000074505806, "reward_std": 0.6266848891973495, "rewards/equation_reward_func": 0.3472222338120143, "rewards/format_reward_func": 0.15277778171002865, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 323.0347315470378, "epoch": 0.8933753943217666, "grad_norm": 113.45003802315175, "kl": 83.2734375, "learning_rate": 4.969009386364433e-07, "loss": 0.3054, "reward": 0.4861111131807168, "reward_std": 0.581800473233064, "rewards/equation_reward_func": 0.2986111218730609, "rewards/format_reward_func": 0.1875000068296989, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 427.35418065388996, "epoch": 0.897160883280757, "grad_norm": 28.936753072783624, "kl": 65.74479166666667, "learning_rate": 4.968584297949254e-07, "loss": 0.2886, "reward": 0.4305555659035842, "reward_std": 0.5503566016753515, "rewards/equation_reward_func": 0.3194444527228673, "rewards/format_reward_func": 0.11111111442248027, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 356.95834096272785, "epoch": 0.9009463722397476, "grad_norm": 53.210272136279166, "kl": 67.9296875, "learning_rate": 4.968156332389489e-07, "loss": 0.2718, "reward": 0.652777798473835, "reward_std": 0.6074397390087446, "rewards/equation_reward_func": 0.44444445582727593, "rewards/format_reward_func": 0.20833333830038706, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 345.06250254313153, "epoch": 0.9047318611987382, "grad_norm": 68.31437143998066, "kl": 26.390625, "learning_rate": 4.967725490183929e-07, "loss": 0.2034, "reward": 0.5625000111758709, "reward_std": 0.6406622032324473, "rewards/equation_reward_func": 0.35416667970518273, "rewards/format_reward_func": 0.2083333389212688, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 384.29168192545575, "epoch": 0.9085173501577287, "grad_norm": 46.566871330301204, "kl": 38.3125, "learning_rate": 4.967291771834726e-07, "loss": 0.2743, "reward": 0.5138889116545519, "reward_std": 0.6012993454933167, "rewards/equation_reward_func": 0.3472222375373046, "rewards/format_reward_func": 0.1666666685293118, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 403.1458435058594, "epoch": 0.9123028391167193, "grad_norm": 52.966354403482825, "kl": 58.018229166666664, "learning_rate": 4.96685517784738e-07, "loss": 0.1692, "reward": 0.5555555783212185, "reward_std": 0.5279722325503826, "rewards/equation_reward_func": 0.3819444576899211, "rewards/format_reward_func": 0.17361111318071684, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 415.2847328186035, "epoch": 0.9160883280757098, "grad_norm": 76.22102872372605, "kl": 36.552083333333336, "learning_rate": 4.966415708730742e-07, "loss": 0.2723, "reward": 0.4930555745959282, "reward_std": 0.5246221944689751, "rewards/equation_reward_func": 0.31944445210198563, "rewards/format_reward_func": 0.173611115043362, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 414.2291793823242, "epoch": 0.9198738170347003, "grad_norm": 126.65545998025205, "kl": 60.572916666666664, "learning_rate": 4.965973364997015e-07, "loss": 0.2943, "reward": 0.5138889017204443, "reward_std": 0.6207031682133675, "rewards/equation_reward_func": 0.3402777835726738, "rewards/format_reward_func": 0.1736111156642437, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 448.9166768391927, "epoch": 0.9236593059936908, "grad_norm": 39.174744576485224, "kl": 56.713541666666664, "learning_rate": 4.965528147161752e-07, "loss": 0.2663, "reward": 0.46527779412766296, "reward_std": 0.4942639557023843, "rewards/equation_reward_func": 0.30555556776622933, "rewards/format_reward_func": 0.1597222244987885, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 486.6319580078125, "epoch": 0.9274447949526814, "grad_norm": 48.44066729068605, "kl": 102.69791666666667, "learning_rate": 4.965080055743858e-07, "loss": 0.2164, "reward": 0.36805556279917556, "reward_std": 0.503364427636067, "rewards/equation_reward_func": 0.22916667411724725, "rewards/format_reward_func": 0.1388888917863369, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 491.8541844685872, "epoch": 0.931230283911672, "grad_norm": 46.336260606492786, "kl": 81.82291666666667, "learning_rate": 4.964629091265583e-07, "loss": 0.2553, "reward": 0.36805556900799274, "reward_std": 0.39493420471747714, "rewards/equation_reward_func": 0.2430555603156487, "rewards/format_reward_func": 0.1250000031044086, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 470.3472315470378, "epoch": 0.9350157728706625, "grad_norm": 117.64457418851589, "kl": 107.79166666666667, "learning_rate": 4.964175254252529e-07, "loss": 0.2875, "reward": 0.29166667473812896, "reward_std": 0.40408586089809734, "rewards/equation_reward_func": 0.1944444508602222, "rewards/format_reward_func": 0.09722222449878852, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 440.8472391764323, "epoch": 0.938801261829653, "grad_norm": 122.35688787505664, "kl": 80.23958333333333, "learning_rate": 4.963718545233644e-07, "loss": 0.2675, "reward": 0.2916666815678279, "reward_std": 0.4292173832654953, "rewards/equation_reward_func": 0.1527777804682652, "rewards/format_reward_func": 0.1388888917863369, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 451.4514045715332, "epoch": 0.9425867507886435, "grad_norm": 61.21324750008328, "kl": 64.75, "learning_rate": 4.963258964741226e-07, "loss": 0.3291, "reward": 0.3819444589316845, "reward_std": 0.4863445957501729, "rewards/equation_reward_func": 0.26388889489074546, "rewards/format_reward_func": 0.11805555845300357, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 448.3680674235026, "epoch": 0.9463722397476341, "grad_norm": 76.04318455804953, "kl": 56.770833333333336, "learning_rate": 4.962796513310916e-07, "loss": 0.2302, "reward": 0.3333333383003871, "reward_std": 0.4893345981836319, "rewards/equation_reward_func": 0.2222222276031971, "rewards/format_reward_func": 0.11111111318071683, "step": 500 } ], "logging_steps": 2, "max_steps": 6000, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }