{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.12234910277324633, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 479.9791717529297, "epoch": 0.0004078303425774878, "grad_norm": 3.6416013248561003, "kl": 0.0, "learning_rate": 6.666666666666667e-08, "loss": 0.0, "reward": 0.14017362147569656, "reward_std": 0.433171808719635, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.09850695356726646, "step": 2 }, { "completion_length": 370.3333435058594, "epoch": 0.0008156606851549756, "grad_norm": 2.7919516711454913, "kl": 0.00021123886108398438, "learning_rate": 1.3333333333333334e-07, "loss": 0.0, "reward": 0.36586807668209076, "reward_std": 0.5816957801580429, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.32420141994953156, "step": 4 }, { "completion_length": 381.8958435058594, "epoch": 0.0012234910277324632, "grad_norm": 2.1831646783127723, "kl": 0.00020503997802734375, "learning_rate": 2e-07, "loss": 0.0, "reward": 0.19583334028720856, "reward_std": 0.49165327847003937, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 0.19583334028720856, "step": 6 }, { "completion_length": 370.5625, "epoch": 0.0016313213703099511, "grad_norm": 2.7704350046014077, "kl": 0.00017118453979492188, "learning_rate": 2.6666666666666667e-07, "loss": 0.0, "reward": 0.36277779191732407, "reward_std": 0.5977305769920349, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 0.3627777770161629, "step": 8 }, { "completion_length": 463.37501525878906, "epoch": 0.0020391517128874386, "grad_norm": 2.123991976170542, "kl": 0.00017118453979492188, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "reward": 0.29836806654930115, "reward_std": 0.6623781323432922, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.25670139491558075, "step": 10 }, { "completion_length": 421.93751525878906, "epoch": 0.0024469820554649264, "grad_norm": 2.878424655232733, "kl": 0.00017547607421875, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.3968055695295334, "reward_std": 0.63937908411026, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 0.3968055695295334, "step": 12 }, { "completion_length": 475.3541717529297, "epoch": 0.0028548123980424145, "grad_norm": 2.4896630378314084, "kl": 0.00018596649169921875, "learning_rate": 4.6666666666666666e-07, "loss": 0.0, "reward": 0.24850694835186005, "reward_std": 0.5338329374790192, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 0.24850695580244064, "step": 14 }, { "completion_length": 416.6666717529297, "epoch": 0.0032626427406199023, "grad_norm": 2.09352394131462, "kl": 0.0005130767822265625, "learning_rate": 5.333333333333333e-07, "loss": 0.0, "reward": 0.33868058025836945, "reward_std": 0.6486604511737823, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.29701392352581024, "step": 16 }, { "completion_length": 415.00001525878906, "epoch": 0.00367047308319739, "grad_norm": 2.0112836386100708, "kl": 0.0008068084716796875, "learning_rate": 6e-07, "loss": 0.0, "reward": 0.25767362117767334, "reward_std": 0.5489525943994522, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.21600694954395294, "step": 18 }, { "completion_length": 297.3958435058594, "epoch": 0.004078303425774877, "grad_norm": 2.8663136385547197, "kl": 0.001346588134765625, "learning_rate": 6.666666666666666e-07, "loss": 0.0, "reward": 0.5803819894790649, "reward_std": 0.7896733283996582, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 0.45538195967674255, "step": 20 }, { "completion_length": 399.54168701171875, "epoch": 0.004486133768352365, "grad_norm": 2.697624014237005, "kl": 0.00267791748046875, "learning_rate": 7.333333333333332e-07, "loss": 0.0, "reward": 0.4483680725097656, "reward_std": 0.5992304682731628, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 0.44836805760860443, "step": 22 }, { "completion_length": 284.75001525878906, "epoch": 0.004893964110929853, "grad_norm": 2.5548177263979235, "kl": 0.0041961669921875, "learning_rate": 8e-07, "loss": 0.0, "reward": 0.5757291615009308, "reward_std": 0.647213488817215, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 0.5757291913032532, "step": 24 }, { "completion_length": 383.0833435058594, "epoch": 0.005301794453507341, "grad_norm": 2.5987870166821843, "kl": 0.00470733642578125, "learning_rate": 8.666666666666667e-07, "loss": 0.0, "reward": 0.552534744143486, "reward_std": 0.7489242553710938, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.5108680874109268, "step": 26 }, { "completion_length": 310.5, "epoch": 0.005709624796084829, "grad_norm": 2.3713758660284148, "kl": 0.006378173828125, "learning_rate": 9.333333333333333e-07, "loss": 0.0, "reward": 0.8682639002799988, "reward_std": 0.9391801357269287, "rewards/equation_reward_func": 0.2083333432674408, "rewards/format_reward_func": 0.6599305868148804, "step": 28 }, { "completion_length": 259.1666717529297, "epoch": 0.006117455138662317, "grad_norm": 1.9481870816169569, "kl": 0.0147705078125, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.7582292258739471, "reward_std": 0.7113883793354034, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.674895852804184, "step": 30 }, { "completion_length": 294.66668701171875, "epoch": 0.0065252854812398045, "grad_norm": 2.8753021397818284, "kl": 0.010467529296875, "learning_rate": 9.999696229471714e-07, "loss": 0.0, "reward": 0.931770920753479, "reward_std": 0.7402721643447876, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 0.890104204416275, "step": 32 }, { "completion_length": 190.70834350585938, "epoch": 0.006933115823817292, "grad_norm": 3.149911745984124, "kl": 0.0218505859375, "learning_rate": 9.998784954797472e-07, "loss": 0.0, "reward": 1.0088889300823212, "reward_std": 0.7540942430496216, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 0.9255555272102356, "step": 34 }, { "completion_length": 246.4166717529297, "epoch": 0.00734094616639478, "grad_norm": 3.153714167577427, "kl": 0.01788330078125, "learning_rate": 9.99726628670463e-07, "loss": 0.0, "reward": 1.202048659324646, "reward_std": 1.045266568660736, "rewards/equation_reward_func": 0.2500000074505806, "rewards/format_reward_func": 0.9520486295223236, "step": 36 }, { "completion_length": 275.31251525878906, "epoch": 0.007748776508972268, "grad_norm": 0.9481466466680466, "kl": 0.013427734375, "learning_rate": 9.995140409723828e-07, "loss": 0.0, "reward": 1.1351736187934875, "reward_std": 0.5596802830696106, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.1351736187934875, "step": 38 }, { "completion_length": 188.5416717529297, "epoch": 0.008156606851549755, "grad_norm": 1.4742456865490186, "kl": 0.02484130859375, "learning_rate": 9.99240758216658e-07, "loss": 0.0, "reward": 1.0687847137451172, "reward_std": 0.6074499785900116, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 1.0271181166172028, "step": 40 }, { "completion_length": 199.70833587646484, "epoch": 0.008564437194127243, "grad_norm": 1.915063750564903, "kl": 0.032958984375, "learning_rate": 9.989068136093872e-07, "loss": 0.0, "reward": 1.1395833194255829, "reward_std": 0.5386238098144531, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 1.0979166626930237, "step": 42 }, { "completion_length": 267.8333435058594, "epoch": 0.00897226753670473, "grad_norm": 1.3838672018586455, "kl": 0.02752685546875, "learning_rate": 9.985122477275824e-07, "loss": 0.0, "reward": 1.1198958158493042, "reward_std": 0.4947269856929779, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.119895875453949, "step": 44 }, { "completion_length": 198.3541717529297, "epoch": 0.009380097879282219, "grad_norm": 2.65001960202523, "kl": 0.0621337890625, "learning_rate": 9.98057108514238e-07, "loss": 0.0001, "reward": 1.3591667413711548, "reward_std": 0.5990243405103683, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 1.1924999952316284, "step": 46 }, { "completion_length": 180.14583587646484, "epoch": 0.009787928221859706, "grad_norm": 2.273310138239933, "kl": 0.02679443359375, "learning_rate": 9.975414512725056e-07, "loss": 0.0, "reward": 1.4479514360427856, "reward_std": 0.5268709659576416, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 1.281284749507904, "step": 48 }, { "completion_length": 297.7291793823242, "epoch": 0.010195758564437194, "grad_norm": 1.5712495679827763, "kl": 0.02593994140625, "learning_rate": 9.969653386589747e-07, "loss": 0.0, "reward": 1.27177095413208, "reward_std": 0.6252816617488861, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 1.2301042079925537, "step": 50 }, { "completion_length": 194.5416717529297, "epoch": 0.010603588907014683, "grad_norm": 2.201493729261743, "kl": 0.03179931640625, "learning_rate": 9.963288406760582e-07, "loss": 0.0, "reward": 1.5546875, "reward_std": 0.7862544655799866, "rewards/equation_reward_func": 0.25, "rewards/format_reward_func": 1.3046875596046448, "step": 52 }, { "completion_length": 294.8958435058594, "epoch": 0.01101141924959217, "grad_norm": 1.093662160894823, "kl": 0.0328369140625, "learning_rate": 9.956320346634875e-07, "loss": 0.0, "reward": 1.2887500524520874, "reward_std": 0.49396970868110657, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.2887500524520874, "step": 54 }, { "completion_length": 300.1458435058594, "epoch": 0.011419249592169658, "grad_norm": 0.8578612513997826, "kl": 0.03271484375, "learning_rate": 9.94875005288915e-07, "loss": 0.0, "reward": 1.1602779626846313, "reward_std": 0.5292092859745026, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.1602778434753418, "step": 56 }, { "completion_length": 299.7291717529297, "epoch": 0.011827079934747145, "grad_norm": 1.6716454246814316, "kl": 0.032470703125, "learning_rate": 9.940578445376257e-07, "loss": 0.0, "reward": 1.3559028506278992, "reward_std": 0.7161896526813507, "rewards/equation_reward_func": 0.1250000037252903, "rewards/format_reward_func": 1.2309028506278992, "step": 58 }, { "completion_length": 366.50001525878906, "epoch": 0.012234910277324634, "grad_norm": 1.2542596393593248, "kl": 0.026611328125, "learning_rate": 9.931806517013612e-07, "loss": 0.0, "reward": 1.2951388955116272, "reward_std": 0.6559239327907562, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.2951388955116272, "step": 60 }, { "completion_length": 267.37501525878906, "epoch": 0.01264274061990212, "grad_norm": 1.951739626954709, "kl": 0.03289794921875, "learning_rate": 9.922435333662535e-07, "loss": 0.0, "reward": 1.624826431274414, "reward_std": 0.869500607252121, "rewards/equation_reward_func": 0.2916666716337204, "rewards/format_reward_func": 1.3331597447395325, "step": 62 }, { "completion_length": 252.1041717529297, "epoch": 0.013050570962479609, "grad_norm": 1.7631361729865684, "kl": 0.0838623046875, "learning_rate": 9.912466033998757e-07, "loss": 0.0001, "reward": 1.5268749594688416, "reward_std": 0.7954416573047638, "rewards/equation_reward_func": 0.2916666679084301, "rewards/format_reward_func": 1.2352083921432495, "step": 64 }, { "completion_length": 266.2291717529297, "epoch": 0.013458401305057096, "grad_norm": 1.086442325424048, "kl": 0.0350341796875, "learning_rate": 9.901899829374047e-07, "loss": 0.0, "reward": 1.3218055367469788, "reward_std": 0.43308839201927185, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.3218055367469788, "step": 66 }, { "completion_length": 270.62501525878906, "epoch": 0.013866231647634585, "grad_norm": 0.7757080705779537, "kl": 0.03179931640625, "learning_rate": 9.890738003669027e-07, "loss": 0.0, "reward": 1.440381944179535, "reward_std": 0.36590851843357086, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.440381944179535, "step": 68 }, { "completion_length": 361.7291717529297, "epoch": 0.014274061990212071, "grad_norm": 0.9196489537714668, "kl": 0.105224609375, "learning_rate": 9.878981913137177e-07, "loss": 0.0001, "reward": 1.234375, "reward_std": 0.5996429324150085, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.2343750596046448, "step": 70 }, { "completion_length": 324.6041717529297, "epoch": 0.01468189233278956, "grad_norm": 1.3537086792376973, "kl": 0.037841796875, "learning_rate": 9.866632986240029e-07, "loss": 0.0, "reward": 1.4478819966316223, "reward_std": 0.447622686624527, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.4478819966316223, "step": 72 }, { "completion_length": 288.00001525878906, "epoch": 0.015089722675367047, "grad_norm": 0.9416399145929761, "kl": 0.0350341796875, "learning_rate": 9.853692723473598e-07, "loss": 0.0, "reward": 1.6369444727897644, "reward_std": 0.5435648560523987, "rewards/equation_reward_func": 0.125, "rewards/format_reward_func": 1.5119444727897644, "step": 74 }, { "completion_length": 324.8958435058594, "epoch": 0.015497553017944535, "grad_norm": 1.4608277452795702, "kl": 0.0404052734375, "learning_rate": 9.840162697186074e-07, "loss": 0.0, "reward": 1.5702083706855774, "reward_std": 0.8935641050338745, "rewards/equation_reward_func": 0.3333333432674408, "rewards/format_reward_func": 1.236875057220459, "step": 76 }, { "completion_length": 290.56250762939453, "epoch": 0.015905383360522024, "grad_norm": 0.7407367075365319, "kl": 0.0458984375, "learning_rate": 9.826044551386742e-07, "loss": 0.0, "reward": 1.5788541436195374, "reward_std": 0.5843808948993683, "rewards/equation_reward_func": 0.125, "rewards/format_reward_func": 1.4538542032241821, "step": 78 }, { "completion_length": 362.68751525878906, "epoch": 0.01631321370309951, "grad_norm": 1.5661374163452022, "kl": 0.0408935546875, "learning_rate": 9.811340001546251e-07, "loss": 0.0, "reward": 1.6298264265060425, "reward_std": 0.8494586944580078, "rewards/equation_reward_func": 0.25, "rewards/format_reward_func": 1.3798264265060425, "step": 80 }, { "completion_length": 451.125, "epoch": 0.016721044045676998, "grad_norm": 1.0244455960613406, "kl": 0.0401611328125, "learning_rate": 9.79605083438815e-07, "loss": 0.0, "reward": 1.2677431106567383, "reward_std": 0.6154287457466125, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.2677430510520935, "step": 82 }, { "completion_length": 308.6458435058594, "epoch": 0.017128874388254486, "grad_norm": 1.3216564030922435, "kl": 0.0445556640625, "learning_rate": 9.780178907671788e-07, "loss": 0.0, "reward": 1.554166853427887, "reward_std": 0.5582673996686935, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 1.5125000476837158, "step": 84 }, { "completion_length": 374.60418701171875, "epoch": 0.017536704730831975, "grad_norm": 1.4468753547015691, "kl": 0.0377197265625, "learning_rate": 9.763726149966595e-07, "loss": 0.0, "reward": 1.77156263589859, "reward_std": 0.9427327811717987, "rewards/equation_reward_func": 0.3750000111758709, "rewards/format_reward_func": 1.3965625166893005, "step": 86 }, { "completion_length": 404.06251525878906, "epoch": 0.01794453507340946, "grad_norm": 1.3417802102005347, "kl": 0.0430908203125, "learning_rate": 9.74669456041773e-07, "loss": 0.0, "reward": 1.4073264598846436, "reward_std": 0.6282331496477127, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 1.365659773349762, "step": 88 }, { "completion_length": 442.625, "epoch": 0.01835236541598695, "grad_norm": 1.5147263647387277, "kl": 0.0496826171875, "learning_rate": 9.729086208503173e-07, "loss": 0.0, "reward": 1.4621528387069702, "reward_std": 0.8307860195636749, "rewards/equation_reward_func": 0.125, "rewards/format_reward_func": 1.3371528387069702, "step": 90 }, { "completion_length": 333.3541793823242, "epoch": 0.018760195758564437, "grad_norm": 1.754991435054013, "kl": 0.049560546875, "learning_rate": 9.710903233782272e-07, "loss": 0.0, "reward": 1.9437847137451172, "reward_std": 0.8394620716571808, "rewards/equation_reward_func": 0.5, "rewards/format_reward_func": 1.4437847137451172, "step": 92 }, { "completion_length": 381.4583435058594, "epoch": 0.019168026101141926, "grad_norm": 0.8810621605534877, "kl": 0.048828125, "learning_rate": 9.69214784563576e-07, "loss": 0.0, "reward": 1.7743055820465088, "reward_std": 0.7846577763557434, "rewards/equation_reward_func": 0.25, "rewards/format_reward_func": 1.5243056416511536, "step": 94 }, { "completion_length": 413.3958435058594, "epoch": 0.01957585644371941, "grad_norm": 1.1981506775594148, "kl": 0.0509033203125, "learning_rate": 9.672822322997304e-07, "loss": 0.0001, "reward": 1.5128472447395325, "reward_std": 0.724719375371933, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 1.429513931274414, "step": 96 }, { "completion_length": 408.68751525878906, "epoch": 0.0199836867862969, "grad_norm": 0.8127416399915954, "kl": 0.0482177734375, "learning_rate": 9.652929014076592e-07, "loss": 0.0, "reward": 1.4740972518920898, "reward_std": 0.581254854798317, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.4740972518920898, "step": 98 }, { "completion_length": 401.125, "epoch": 0.020391517128874388, "grad_norm": 0.9832509648546535, "kl": 0.04248046875, "learning_rate": 9.632470336074007e-07, "loss": 0.0, "reward": 1.5040277242660522, "reward_std": 0.6817552745342255, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 1.4623610973358154, "step": 100 }, { "completion_length": 434.9583435058594, "epoch": 0.020799347471451877, "grad_norm": 1.430611768368398, "kl": 0.0458984375, "learning_rate": 9.611448774886923e-07, "loss": 0.0, "reward": 1.833784818649292, "reward_std": 0.832920491695404, "rewards/equation_reward_func": 0.25, "rewards/format_reward_func": 1.5837848782539368, "step": 102 }, { "completion_length": 368.8958435058594, "epoch": 0.021207177814029365, "grad_norm": 1.681849348882777, "kl": 0.048828125, "learning_rate": 9.589866884807634e-07, "loss": 0.0, "reward": 2.02239590883255, "reward_std": 1.069144368171692, "rewards/equation_reward_func": 0.5416666865348816, "rewards/format_reward_func": 1.4807292222976685, "step": 104 }, { "completion_length": 417.60418701171875, "epoch": 0.02161500815660685, "grad_norm": 1.5460934994217617, "kl": 0.0577392578125, "learning_rate": 9.567727288213004e-07, "loss": 0.0001, "reward": 1.7238194942474365, "reward_std": 0.8790097832679749, "rewards/equation_reward_func": 0.2916666865348816, "rewards/format_reward_func": 1.432152807712555, "step": 106 }, { "completion_length": 367.2916717529297, "epoch": 0.02202283849918434, "grad_norm": 0.8154580597171138, "kl": 0.079833984375, "learning_rate": 9.545032675245813e-07, "loss": 0.0001, "reward": 1.6206597089767456, "reward_std": 0.5355260521173477, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 1.578993022441864, "step": 108 }, { "completion_length": 378.4375, "epoch": 0.022430668841761828, "grad_norm": 0.8080191505191605, "kl": 2.8504638671875, "learning_rate": 9.521785803487888e-07, "loss": 0.0029, "reward": 1.6229513883590698, "reward_std": 0.5469937920570374, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 1.5396180748939514, "step": 110 }, { "completion_length": 353.7083435058594, "epoch": 0.022838499184339316, "grad_norm": 2.0675711092750553, "kl": 0.0582275390625, "learning_rate": 9.497989497625034e-07, "loss": 0.0001, "reward": 1.9050694108009338, "reward_std": 0.8356568217277527, "rewards/equation_reward_func": 0.3750000111758709, "rewards/format_reward_func": 1.5300694704055786, "step": 112 }, { "completion_length": 396.3958435058594, "epoch": 0.0232463295269168, "grad_norm": 1.7722424779735066, "kl": 0.0474853515625, "learning_rate": 9.473646649103817e-07, "loss": 0.0, "reward": 1.8715277910232544, "reward_std": 0.7635601460933685, "rewards/equation_reward_func": 0.2916666865348816, "rewards/format_reward_func": 1.579861044883728, "step": 114 }, { "completion_length": 392.25001525878906, "epoch": 0.02365415986949429, "grad_norm": 1.4500449746136268, "kl": 0.052490234375, "learning_rate": 9.448760215780216e-07, "loss": 0.0001, "reward": 1.8360764980316162, "reward_std": 0.8432624340057373, "rewards/equation_reward_func": 0.3333333432674408, "rewards/format_reward_func": 1.502743124961853, "step": 116 }, { "completion_length": 445.37501525878906, "epoch": 0.02406199021207178, "grad_norm": 1.0245921035180448, "kl": 0.046142578125, "learning_rate": 9.423333221560229e-07, "loss": 0.0, "reward": 1.8455902934074402, "reward_std": 0.6104674339294434, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 1.8039236664772034, "step": 118 }, { "completion_length": 350.00001525878906, "epoch": 0.024469820554649267, "grad_norm": 1.0559535296932554, "kl": 0.0653076171875, "learning_rate": 9.397368756032444e-07, "loss": 0.0001, "reward": 2.0223612189292908, "reward_std": 0.8122723698616028, "rewards/equation_reward_func": 0.4166666679084301, "rewards/format_reward_func": 1.6056944131851196, "step": 120 }, { "completion_length": 430.6041717529297, "epoch": 0.024877650897226752, "grad_norm": 1.2128128464325674, "kl": 0.052978515625, "learning_rate": 9.370869974092628e-07, "loss": 0.0001, "reward": 1.6623265147209167, "reward_std": 0.6956472098827362, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 1.5789931416511536, "step": 122 }, { "completion_length": 430.62501525878906, "epoch": 0.02528548123980424, "grad_norm": 1.3439878279377437, "kl": 0.0567626953125, "learning_rate": 9.343840095560371e-07, "loss": 0.0001, "reward": 1.6742013692855835, "reward_std": 0.8052680194377899, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 1.5075347423553467, "step": 124 }, { "completion_length": 450.1666717529297, "epoch": 0.02569331158238173, "grad_norm": 1.6330050200150412, "kl": 0.05224609375, "learning_rate": 9.316282404787869e-07, "loss": 0.0001, "reward": 1.7786458730697632, "reward_std": 0.5853727161884308, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 1.7369791865348816, "step": 126 }, { "completion_length": 489.02085876464844, "epoch": 0.026101141924959218, "grad_norm": 0.7871177276009865, "kl": 0.051025390625, "learning_rate": 9.288200250260834e-07, "loss": 0.0001, "reward": 1.7109723091125488, "reward_std": 0.6242659687995911, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.7109723091125488, "step": 128 }, { "completion_length": 386.79168701171875, "epoch": 0.026508972267536703, "grad_norm": 1.4313328997618522, "kl": 0.07373046875, "learning_rate": 9.259597044191635e-07, "loss": 0.0001, "reward": 1.8375002145767212, "reward_std": 0.8174974322319031, "rewards/equation_reward_func": 0.2916666865348816, "rewards/format_reward_func": 1.5458334684371948, "step": 130 }, { "completion_length": 444.9791717529297, "epoch": 0.026916802610114192, "grad_norm": 0.7975312553477618, "kl": 0.058837890625, "learning_rate": 9.230476262104676e-07, "loss": 0.0001, "reward": 1.977222204208374, "reward_std": 0.6751963198184967, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 1.8105555772781372, "step": 132 }, { "completion_length": 399.4791717529297, "epoch": 0.02732463295269168, "grad_norm": 1.57086996187115, "kl": 0.071044921875, "learning_rate": 9.200841442414105e-07, "loss": 0.0001, "reward": 1.7256250977516174, "reward_std": 0.631768524646759, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 1.6422916650772095, "step": 134 }, { "completion_length": 497.08335876464844, "epoch": 0.02773246329526917, "grad_norm": 0.8207193125582591, "kl": 0.078857421875, "learning_rate": 9.17069618599385e-07, "loss": 0.0001, "reward": 1.752673625946045, "reward_std": 0.6211968958377838, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.7526736855506897, "step": 136 }, { "completion_length": 456.29168701171875, "epoch": 0.028140293637846654, "grad_norm": 0.9050345890235396, "kl": 0.0628662109375, "learning_rate": 9.1400441557401e-07, "loss": 0.0001, "reward": 1.7863542437553406, "reward_std": 0.5832376182079315, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 1.7446874976158142, "step": 138 }, { "completion_length": 581.5416870117188, "epoch": 0.028548123980424143, "grad_norm": 0.8497382573737793, "kl": 0.060791015625, "learning_rate": 9.108889076126225e-07, "loss": 0.0001, "reward": 1.7704166769981384, "reward_std": 0.7596422731876373, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.7704167366027832, "step": 140 }, { "completion_length": 409.06251525878906, "epoch": 0.02895595432300163, "grad_norm": 1.4428479278815958, "kl": 0.0732421875, "learning_rate": 9.077234732750223e-07, "loss": 0.0001, "reward": 2.054965376853943, "reward_std": 0.8280318379402161, "rewards/equation_reward_func": 0.4166666865348816, "rewards/format_reward_func": 1.6382986307144165, "step": 142 }, { "completion_length": 399.1875, "epoch": 0.02936378466557912, "grad_norm": 0.850509849129193, "kl": 0.068115234375, "learning_rate": 9.045084971874737e-07, "loss": 0.0001, "reward": 2.358993172645569, "reward_std": 0.7760606110095978, "rewards/equation_reward_func": 0.625, "rewards/format_reward_func": 1.7339931726455688, "step": 144 }, { "completion_length": 384.2708435058594, "epoch": 0.029771615008156605, "grad_norm": 1.3099408456816737, "kl": 0.10888671875, "learning_rate": 9.012443699959704e-07, "loss": 0.0001, "reward": 2.7191320657730103, "reward_std": 0.9904708862304688, "rewards/equation_reward_func": 1.0, "rewards/format_reward_func": 1.7191320657730103, "step": 146 }, { "completion_length": 464.12501525878906, "epoch": 0.030179445350734094, "grad_norm": 0.9088601877007455, "kl": 0.08056640625, "learning_rate": 8.979314883187692e-07, "loss": 0.0001, "reward": 2.0606598258018494, "reward_std": 0.852357029914856, "rewards/equation_reward_func": 0.3333333432674408, "rewards/format_reward_func": 1.7273263931274414, "step": 148 }, { "completion_length": 566.8750305175781, "epoch": 0.030587275693311582, "grad_norm": 0.8778862901380006, "kl": 0.067626953125, "learning_rate": 8.945702546981968e-07, "loss": 0.0001, "reward": 1.779270887374878, "reward_std": 0.693590372800827, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.7792708277702332, "step": 150 }, { "completion_length": 429.3333435058594, "epoch": 0.03099510603588907, "grad_norm": 0.9577088802916648, "kl": 0.096923828125, "learning_rate": 8.911610775517382e-07, "loss": 0.0001, "reward": 1.8277431726455688, "reward_std": 0.6039248108863831, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 1.7860764265060425, "step": 152 }, { "completion_length": 565.4791870117188, "epoch": 0.031402936378466556, "grad_norm": 0.8371289059796367, "kl": 0.08740234375, "learning_rate": 8.877043711224107e-07, "loss": 0.0001, "reward": 1.9469445943832397, "reward_std": 0.5404301732778549, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.946944534778595, "step": 154 }, { "completion_length": 541.1666717529297, "epoch": 0.03181076672104405, "grad_norm": 0.7969331394521091, "kl": 0.079833984375, "learning_rate": 8.842005554284295e-07, "loss": 0.0001, "reward": 2.1353471875190735, "reward_std": 0.8812746703624725, "rewards/equation_reward_func": 0.2916666865348816, "rewards/format_reward_func": 1.8436806201934814, "step": 156 }, { "completion_length": 464.37501525878906, "epoch": 0.03221859706362153, "grad_norm": 1.351816458116067, "kl": 0.078857421875, "learning_rate": 8.806500562121722e-07, "loss": 0.0001, "reward": 2.5637847781181335, "reward_std": 0.8339135944843292, "rewards/equation_reward_func": 0.7083333544433117, "rewards/format_reward_func": 1.8554513454437256, "step": 158 }, { "completion_length": 629.9583435058594, "epoch": 0.03262642740619902, "grad_norm": 0.9040821275296973, "kl": 0.06396484375, "learning_rate": 8.77053304888448e-07, "loss": 0.0001, "reward": 1.7654513716697693, "reward_std": 0.9269569218158722, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 1.7237846851348877, "step": 160 }, { "completion_length": 637.7500305175781, "epoch": 0.03303425774877651, "grad_norm": 0.8146639791448892, "kl": 0.065673828125, "learning_rate": 8.734107384920769e-07, "loss": 0.0001, "reward": 1.7398958802223206, "reward_std": 0.8910411596298218, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.7398958802223206, "step": 162 }, { "completion_length": 553.3541870117188, "epoch": 0.033442088091353996, "grad_norm": 1.0794685996564912, "kl": 0.08251953125, "learning_rate": 8.69722799624786e-07, "loss": 0.0001, "reward": 2.068472385406494, "reward_std": 0.7307632863521576, "rewards/equation_reward_func": 0.125, "rewards/format_reward_func": 1.943472445011139, "step": 164 }, { "completion_length": 510.75, "epoch": 0.03384991843393149, "grad_norm": 1.5254552695681238, "kl": 0.084716796875, "learning_rate": 8.659899364014308e-07, "loss": 0.0001, "reward": 2.1311458945274353, "reward_std": 0.8883395195007324, "rewards/equation_reward_func": 0.3333333432674408, "rewards/format_reward_func": 1.7978126406669617, "step": 166 }, { "completion_length": 581.5416870117188, "epoch": 0.03425774877650897, "grad_norm": 0.9271897233542844, "kl": 0.07763671875, "learning_rate": 8.622126023955445e-07, "loss": 0.0001, "reward": 1.963923692703247, "reward_std": 0.6030838936567307, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.9639238119125366, "step": 168 }, { "completion_length": 555.3125152587891, "epoch": 0.03466557911908646, "grad_norm": 1.4561466805793857, "kl": 0.070556640625, "learning_rate": 8.583912565842256e-07, "loss": 0.0001, "reward": 2.237326502799988, "reward_std": 0.8882516920566559, "rewards/equation_reward_func": 0.5, "rewards/format_reward_func": 1.7373263835906982, "step": 170 }, { "completion_length": 421.2708435058594, "epoch": 0.03507340946166395, "grad_norm": 1.3044883746742693, "kl": 0.10107421875, "learning_rate": 8.545263632923686e-07, "loss": 0.0001, "reward": 2.6816667318344116, "reward_std": 1.1341252326965332, "rewards/equation_reward_func": 1.041666716337204, "rewards/format_reward_func": 1.64000004529953, "step": 172 }, { "completion_length": 592.6458435058594, "epoch": 0.035481239804241435, "grad_norm": 0.9961988228794976, "kl": 0.066162109375, "learning_rate": 8.506183921362442e-07, "loss": 0.0001, "reward": 2.093229293823242, "reward_std": 0.8178855180740356, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 1.9265625476837158, "step": 174 }, { "completion_length": 332.0208435058594, "epoch": 0.03588907014681892, "grad_norm": 1.5901752280491694, "kl": 0.099609375, "learning_rate": 8.466678179664377e-07, "loss": 0.0001, "reward": 2.9652082920074463, "reward_std": 0.8383155167102814, "rewards/equation_reward_func": 1.2500000596046448, "rewards/format_reward_func": 1.715208351612091, "step": 176 }, { "completion_length": 460.6666717529297, "epoch": 0.03629690048939641, "grad_norm": 1.6000004098847773, "kl": 0.16796875, "learning_rate": 8.426751208101499e-07, "loss": 0.0002, "reward": 2.6753125190734863, "reward_std": 1.0194191336631775, "rewards/equation_reward_func": 0.916666716337204, "rewards/format_reward_func": 1.7586458325386047, "step": 178 }, { "completion_length": 477.3125, "epoch": 0.0367047308319739, "grad_norm": 1.2239460420462749, "kl": 0.088623046875, "learning_rate": 8.386407858128706e-07, "loss": 0.0001, "reward": 2.590486168861389, "reward_std": 0.9470961093902588, "rewards/equation_reward_func": 0.75, "rewards/format_reward_func": 1.8404861688613892, "step": 180 }, { "completion_length": 444.5416717529297, "epoch": 0.03711256117455139, "grad_norm": 1.230296809506791, "kl": 0.10107421875, "learning_rate": 8.34565303179429e-07, "loss": 0.0001, "reward": 2.523506999015808, "reward_std": 0.9190675318241119, "rewards/equation_reward_func": 0.6666666865348816, "rewards/format_reward_func": 1.8568402528762817, "step": 182 }, { "completion_length": 556.7916870117188, "epoch": 0.037520391517128875, "grad_norm": 0.9132808347888518, "kl": 0.083984375, "learning_rate": 8.304491681144305e-07, "loss": 0.0001, "reward": 1.9321181178092957, "reward_std": 0.6165703535079956, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.9321181774139404, "step": 184 }, { "completion_length": 329.1666717529297, "epoch": 0.03792822185970636, "grad_norm": 1.5619424825885055, "kl": 0.118408203125, "learning_rate": 8.262928807620843e-07, "loss": 0.0001, "reward": 3.2064584493637085, "reward_std": 0.8171246647834778, "rewards/equation_reward_func": 1.5000000596046448, "rewards/format_reward_func": 1.706458330154419, "step": 186 }, { "completion_length": 388.1041717529297, "epoch": 0.03833605220228385, "grad_norm": 1.5560017569200078, "kl": 0.115478515625, "learning_rate": 8.220969461454321e-07, "loss": 0.0001, "reward": 2.5811806321144104, "reward_std": 0.5943560600280762, "rewards/equation_reward_func": 0.9166666865348816, "rewards/format_reward_func": 1.6645139455795288, "step": 188 }, { "completion_length": 502.33335876464844, "epoch": 0.03874388254486134, "grad_norm": 1.3187487188473963, "kl": 0.103515625, "learning_rate": 8.178618741049841e-07, "loss": 0.0001, "reward": 2.311215341091156, "reward_std": 0.7084816992282867, "rewards/equation_reward_func": 0.375, "rewards/format_reward_func": 1.936215341091156, "step": 190 }, { "completion_length": 347.29168701171875, "epoch": 0.03915171288743882, "grad_norm": 1.0778244344859724, "kl": 0.1083984375, "learning_rate": 8.135881792367685e-07, "loss": 0.0001, "reward": 2.41055566072464, "reward_std": 0.7850378751754761, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 1.66055566072464, "step": 192 }, { "completion_length": 335.9791717529297, "epoch": 0.039559543230016314, "grad_norm": 1.522034238839679, "kl": 0.105712890625, "learning_rate": 8.092763808298046e-07, "loss": 0.0001, "reward": 2.9850348234176636, "reward_std": 0.9680465757846832, "rewards/equation_reward_func": 1.375, "rewards/format_reward_func": 1.610034704208374, "step": 194 }, { "completion_length": 586.5625305175781, "epoch": 0.0399673735725938, "grad_norm": 0.6708954041029114, "kl": 0.10693359375, "learning_rate": 8.049270028030045e-07, "loss": 0.0001, "reward": 1.8110415935516357, "reward_std": 0.7432913780212402, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.8110417127609253, "step": 196 }, { "completion_length": 492.7916717529297, "epoch": 0.04037520391517129, "grad_norm": 1.1370661223441034, "kl": 0.087646484375, "learning_rate": 8.005405736415125e-07, "loss": 0.0001, "reward": 2.2512154579162598, "reward_std": 0.7727322578430176, "rewards/equation_reward_func": 0.2916666716337204, "rewards/format_reward_func": 1.9595486521720886, "step": 198 }, { "completion_length": 618.7708740234375, "epoch": 0.040783034257748776, "grad_norm": 0.9472446872281457, "kl": 0.081298828125, "learning_rate": 7.961176263324901e-07, "loss": 0.0001, "reward": 2.0697221755981445, "reward_std": 0.9369174838066101, "rewards/equation_reward_func": 0.2083333432674408, "rewards/format_reward_func": 1.861388921737671, "step": 200 }, { "completion_length": 485.1666717529297, "epoch": 0.04119086460032626, "grad_norm": 0.9224224522873015, "kl": 0.21337890625, "learning_rate": 7.916586983003533e-07, "loss": 0.0002, "reward": 2.622014045715332, "reward_std": 0.832764744758606, "rewards/equation_reward_func": 0.7916666865348816, "rewards/format_reward_func": 1.8303472995758057, "step": 202 }, { "completion_length": 498.3125, "epoch": 0.041598694942903754, "grad_norm": 0.8732846033594475, "kl": 0.09912109375, "learning_rate": 7.871643313414718e-07, "loss": 0.0001, "reward": 2.6251736879348755, "reward_std": 0.7892851531505585, "rewards/equation_reward_func": 0.875, "rewards/format_reward_func": 1.7501736879348755, "step": 204 }, { "completion_length": 436.4375, "epoch": 0.04200652528548124, "grad_norm": 1.182953041273801, "kl": 0.111328125, "learning_rate": 7.826350715583358e-07, "loss": 0.0001, "reward": 2.4944097995758057, "reward_std": 0.7472249865531921, "rewards/equation_reward_func": 0.7083333730697632, "rewards/format_reward_func": 1.7860764265060425, "step": 206 }, { "completion_length": 326.7083435058594, "epoch": 0.04241435562805873, "grad_norm": 1.6056713694544402, "kl": 0.117919921875, "learning_rate": 7.780714692932002e-07, "loss": 0.0001, "reward": 2.952360987663269, "reward_std": 0.8980874419212341, "rewards/equation_reward_func": 1.2500000596046448, "rewards/format_reward_func": 1.7023611664772034, "step": 208 }, { "completion_length": 470.50001525878906, "epoch": 0.042822185970636216, "grad_norm": 1.0414434379733573, "kl": 0.093994140625, "learning_rate": 7.734740790612136e-07, "loss": 0.0001, "reward": 2.6838542222976685, "reward_std": 1.0669545829296112, "rewards/equation_reward_func": 0.7916666865348816, "rewards/format_reward_func": 1.8921875357627869, "step": 210 }, { "completion_length": 577.0625305175781, "epoch": 0.0432300163132137, "grad_norm": 1.0292452997964692, "kl": 0.088134765625, "learning_rate": 7.688434594830391e-07, "loss": 0.0001, "reward": 1.7830208539962769, "reward_std": 0.7203674912452698, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.7830208539962769, "step": 212 }, { "completion_length": 443.1875, "epoch": 0.04363784665579119, "grad_norm": 1.1857098524166056, "kl": 0.169921875, "learning_rate": 7.641801732169795e-07, "loss": 0.0002, "reward": 2.93423593044281, "reward_std": 1.0427783131599426, "rewards/equation_reward_func": 1.125, "rewards/format_reward_func": 1.809236228466034, "step": 214 }, { "completion_length": 489.66668701171875, "epoch": 0.04404567699836868, "grad_norm": 0.8656675869839974, "kl": 0.13232421875, "learning_rate": 7.594847868906076e-07, "loss": 0.0001, "reward": 2.7152432203292847, "reward_std": 1.0624222159385681, "rewards/equation_reward_func": 1.1250000298023224, "rewards/format_reward_func": 1.59024316072464, "step": 216 }, { "completion_length": 396.4166717529297, "epoch": 0.04445350734094616, "grad_norm": 1.2559326528283787, "kl": 0.1279296875, "learning_rate": 7.547578710319174e-07, "loss": 0.0001, "reward": 3.0093750953674316, "reward_std": 0.960063099861145, "rewards/equation_reward_func": 1.2916666865348816, "rewards/format_reward_func": 1.71770840883255, "step": 218 }, { "completion_length": 444.39585876464844, "epoch": 0.044861337683523655, "grad_norm": 1.0899183864856226, "kl": 0.378173828125, "learning_rate": 7.5e-07, "loss": 0.0004, "reward": 2.7316668033599854, "reward_std": 0.6873580813407898, "rewards/equation_reward_func": 0.8333333358168602, "rewards/format_reward_func": 1.898333489894867, "step": 220 }, { "completion_length": 464.70835876464844, "epoch": 0.04526916802610114, "grad_norm": 0.8216128962927133, "kl": 0.115966796875, "learning_rate": 7.452117519152541e-07, "loss": 0.0001, "reward": 2.5972570180892944, "reward_std": 0.7697752714157104, "rewards/equation_reward_func": 0.75, "rewards/format_reward_func": 1.8472568988800049, "step": 222 }, { "completion_length": 531.8958587646484, "epoch": 0.04567699836867863, "grad_norm": 0.8041197621718661, "kl": 0.144775390625, "learning_rate": 7.403937085891397e-07, "loss": 0.0001, "reward": 2.5897916555404663, "reward_std": 0.7150984704494476, "rewards/equation_reward_func": 0.9166666865348816, "rewards/format_reward_func": 1.6731250286102295, "step": 224 }, { "completion_length": 647.2500305175781, "epoch": 0.04608482871125612, "grad_norm": 1.1739617017348787, "kl": 0.093017578125, "learning_rate": 7.355464554534836e-07, "loss": 0.0001, "reward": 2.5721182823181152, "reward_std": 1.2919026017189026, "rewards/equation_reward_func": 0.7083333432674408, "rewards/format_reward_func": 1.8637848496437073, "step": 226 }, { "completion_length": 618.6041870117188, "epoch": 0.0464926590538336, "grad_norm": 0.786794707945746, "kl": 0.091064453125, "learning_rate": 7.306705814893439e-07, "loss": 0.0001, "reward": 2.3971527814865112, "reward_std": 0.9052496254444122, "rewards/equation_reward_func": 0.5, "rewards/format_reward_func": 1.8971527814865112, "step": 228 }, { "completion_length": 378.6041717529297, "epoch": 0.046900489396411095, "grad_norm": 1.1431295777314772, "kl": 0.14111328125, "learning_rate": 7.257666791554447e-07, "loss": 0.0001, "reward": 3.0398958921432495, "reward_std": 0.7211508750915527, "rewards/equation_reward_func": 1.25, "rewards/format_reward_func": 1.7898958921432495, "step": 230 }, { "completion_length": 390.97918701171875, "epoch": 0.04730831973898858, "grad_norm": 1.3315143130569003, "kl": 0.1064453125, "learning_rate": 7.20835344316187e-07, "loss": 0.0001, "reward": 3.165451407432556, "reward_std": 0.8909177780151367, "rewards/equation_reward_func": 1.375, "rewards/format_reward_func": 1.7904514074325562, "step": 232 }, { "completion_length": 535.4166870117188, "epoch": 0.047716150081566065, "grad_norm": 0.8301567265751203, "kl": 0.10302734375, "learning_rate": 7.158771761692464e-07, "loss": 0.0001, "reward": 2.5394792556762695, "reward_std": 0.8184142112731934, "rewards/equation_reward_func": 0.75, "rewards/format_reward_func": 1.7894791960716248, "step": 234 }, { "completion_length": 569.0833435058594, "epoch": 0.04812398042414356, "grad_norm": 0.7756037952607522, "kl": 0.11376953125, "learning_rate": 7.108927771727661e-07, "loss": 0.0001, "reward": 2.432673692703247, "reward_std": 0.8917776942253113, "rewards/equation_reward_func": 0.5416666865348816, "rewards/format_reward_func": 1.8910069465637207, "step": 236 }, { "completion_length": 668.8125305175781, "epoch": 0.04853181076672104, "grad_norm": 1.0059181715098997, "kl": 0.084228515625, "learning_rate": 7.058827529721525e-07, "loss": 0.0001, "reward": 1.8420140147209167, "reward_std": 0.8704274594783783, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.842013955116272, "step": 238 }, { "completion_length": 530.625, "epoch": 0.048939641109298535, "grad_norm": 0.9400020844643003, "kl": 0.095458984375, "learning_rate": 7.008477123264847e-07, "loss": 0.0001, "reward": 2.611979365348816, "reward_std": 0.9893729388713837, "rewards/equation_reward_func": 0.6250000298023224, "rewards/format_reward_func": 1.9869792461395264, "step": 240 }, { "completion_length": 593.2083740234375, "epoch": 0.04934747145187602, "grad_norm": 0.7712890526722358, "kl": 0.086669921875, "learning_rate": 6.957882670345458e-07, "loss": 0.0001, "reward": 2.4697917699813843, "reward_std": 1.0504232347011566, "rewards/equation_reward_func": 0.5416666865348816, "rewards/format_reward_func": 1.9281250834465027, "step": 242 }, { "completion_length": 594.3958435058594, "epoch": 0.049755301794453505, "grad_norm": 0.8588310596425649, "kl": 0.0986328125, "learning_rate": 6.90705031860483e-07, "loss": 0.0001, "reward": 1.9615973234176636, "reward_std": 0.6058675646781921, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.9615971446037292, "step": 244 }, { "completion_length": 448.3125228881836, "epoch": 0.050163132137031, "grad_norm": 1.8069886458180306, "kl": 0.113525390625, "learning_rate": 6.855986244591103e-07, "loss": 0.0001, "reward": 2.5989930629730225, "reward_std": 0.7186898589134216, "rewards/equation_reward_func": 0.8333333730697632, "rewards/format_reward_func": 1.7656598091125488, "step": 246 }, { "completion_length": 487.33335876464844, "epoch": 0.05057096247960848, "grad_norm": 1.9638362270119054, "kl": 0.1494140625, "learning_rate": 6.804696653008574e-07, "loss": 0.0001, "reward": 2.645763874053955, "reward_std": 0.979635089635849, "rewards/equation_reward_func": 0.8333333730697632, "rewards/format_reward_func": 1.8124305605888367, "step": 248 }, { "completion_length": 536.5833740234375, "epoch": 0.050978792822185974, "grad_norm": 0.8482235665719111, "kl": 0.094482421875, "learning_rate": 6.753187775963772e-07, "loss": 0.0001, "reward": 1.9076389074325562, "reward_std": 0.5429915189743042, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.9076389074325562, "step": 250 }, { "completion_length": 398.5208435058594, "epoch": 0.05138662316476346, "grad_norm": 1.5501066370417451, "kl": 0.10986328125, "learning_rate": 6.701465872208216e-07, "loss": 0.0001, "reward": 2.95270836353302, "reward_std": 0.8335215449333191, "rewards/equation_reward_func": 1.166666716337204, "rewards/format_reward_func": 1.7860416173934937, "step": 252 }, { "completion_length": 427.0000305175781, "epoch": 0.051794453507340944, "grad_norm": 0.8535810763610331, "kl": 0.13330078125, "learning_rate": 6.649537226377914e-07, "loss": 0.0001, "reward": 2.610729217529297, "reward_std": 0.6564360558986664, "rewards/equation_reward_func": 0.7916666865348816, "rewards/format_reward_func": 1.8190626502037048, "step": 254 }, { "completion_length": 459.0208435058594, "epoch": 0.052202283849918436, "grad_norm": 1.253749573558056, "kl": 0.12646484375, "learning_rate": 6.597408148229741e-07, "loss": 0.0001, "reward": 2.7206597328186035, "reward_std": 0.5680619776248932, "rewards/equation_reward_func": 0.75, "rewards/format_reward_func": 1.970659613609314, "step": 256 }, { "completion_length": 459.16668701171875, "epoch": 0.05261011419249592, "grad_norm": 1.5903888848286962, "kl": 0.12060546875, "learning_rate": 6.545084971874736e-07, "loss": 0.0001, "reward": 2.6335763931274414, "reward_std": 0.6806207001209259, "rewards/equation_reward_func": 0.7083333730697632, "rewards/format_reward_func": 1.925243079662323, "step": 258 }, { "completion_length": 390.5208435058594, "epoch": 0.05301794453507341, "grad_norm": 1.5527056038492013, "kl": 0.118408203125, "learning_rate": 6.492574055008473e-07, "loss": 0.0001, "reward": 2.4415969848632812, "reward_std": 0.600139319896698, "rewards/equation_reward_func": 0.625, "rewards/format_reward_func": 1.81659734249115, "step": 260 }, { "completion_length": 531.9166870117188, "epoch": 0.0534257748776509, "grad_norm": 1.4622847388569014, "kl": 0.12255859375, "learning_rate": 6.439881778138531e-07, "loss": 0.0001, "reward": 2.7648611068725586, "reward_std": 0.6601312011480331, "rewards/equation_reward_func": 0.7083333730697632, "rewards/format_reward_func": 2.056527853012085, "step": 262 }, { "completion_length": 515.7916870117188, "epoch": 0.053833605220228384, "grad_norm": 1.386912182976191, "kl": 0.138427734375, "learning_rate": 6.387014543809223e-07, "loss": 0.0001, "reward": 2.7789584398269653, "reward_std": 0.6216670870780945, "rewards/equation_reward_func": 0.8333333730697632, "rewards/format_reward_func": 1.9456250667572021, "step": 264 }, { "completion_length": 350.0208435058594, "epoch": 0.054241435562805876, "grad_norm": 1.32827783707619, "kl": 0.14892578125, "learning_rate": 6.333978775823631e-07, "loss": 0.0001, "reward": 3.3326735496520996, "reward_std": 0.5917892754077911, "rewards/equation_reward_func": 1.5833333730697632, "rewards/format_reward_func": 1.7493401765823364, "step": 266 }, { "completion_length": 418.87501525878906, "epoch": 0.05464926590538336, "grad_norm": 1.7098008888477785, "kl": 0.162109375, "learning_rate": 6.280780918463057e-07, "loss": 0.0002, "reward": 2.589617967605591, "reward_std": 0.6024105995893478, "rewards/equation_reward_func": 0.75, "rewards/format_reward_func": 1.8396180868148804, "step": 268 }, { "completion_length": 513.5625152587891, "epoch": 0.055057096247960846, "grad_norm": 1.117454148344459, "kl": 0.16162109375, "learning_rate": 6.227427435703995e-07, "loss": 0.0002, "reward": 2.7452430725097656, "reward_std": 1.3093486428260803, "rewards/equation_reward_func": 1.125, "rewards/format_reward_func": 1.6202431321144104, "step": 270 }, { "completion_length": 616.9166870117188, "epoch": 0.05546492659053834, "grad_norm": 0.9401293165638854, "kl": 0.1728515625, "learning_rate": 6.173924810432704e-07, "loss": 0.0002, "reward": 2.637951374053955, "reward_std": 0.9178789854049683, "rewards/equation_reward_func": 0.625, "rewards/format_reward_func": 2.0129514336586, "step": 272 }, { "completion_length": 651.5833740234375, "epoch": 0.05587275693311582, "grad_norm": 0.9567005196766424, "kl": 0.14697265625, "learning_rate": 6.12027954365748e-07, "loss": 0.0001, "reward": 1.926597237586975, "reward_std": 0.8993740975856781, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 1.8849304914474487, "step": 274 }, { "completion_length": 634.5208435058594, "epoch": 0.05628058727569331, "grad_norm": 1.358076325982119, "kl": 0.13427734375, "learning_rate": 6.066498153718734e-07, "loss": 0.0001, "reward": 2.3687500953674316, "reward_std": 1.000350534915924, "rewards/equation_reward_func": 0.4166666865348816, "rewards/format_reward_func": 1.9520832300186157, "step": 276 }, { "completion_length": 517.4166870117188, "epoch": 0.0566884176182708, "grad_norm": 1.5910966658956645, "kl": 0.1484375, "learning_rate": 6.01258717549696e-07, "loss": 0.0001, "reward": 2.7329167127609253, "reward_std": 0.6304636597633362, "rewards/equation_reward_func": 1.0, "rewards/format_reward_func": 1.7329167127609253, "step": 278 }, { "completion_length": 693.8125305175781, "epoch": 0.057096247960848286, "grad_norm": 1.0227721122110067, "kl": 0.12451171875, "learning_rate": 5.958553159618692e-07, "loss": 0.0001, "reward": 2.247395873069763, "reward_std": 0.8254929631948471, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 2.0807292461395264, "step": 280 }, { "completion_length": 699.3125305175781, "epoch": 0.05750407830342578, "grad_norm": 0.9489795742265218, "kl": 0.1298828125, "learning_rate": 5.90440267166055e-07, "loss": 0.0001, "reward": 2.170659899711609, "reward_std": 0.7723036706447601, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 2.128993034362793, "step": 282 }, { "completion_length": 652.3333435058594, "epoch": 0.05791190864600326, "grad_norm": 1.2529290369484056, "kl": 0.13623046875, "learning_rate": 5.850142291351465e-07, "loss": 0.0001, "reward": 2.2495139837265015, "reward_std": 0.8483322262763977, "rewards/equation_reward_func": 0.2083333432674408, "rewards/format_reward_func": 2.0411804914474487, "step": 284 }, { "completion_length": 367.91668701171875, "epoch": 0.05831973898858075, "grad_norm": 1.40910503148467, "kl": 0.16064453125, "learning_rate": 5.795778611773197e-07, "loss": 0.0002, "reward": 3.3931944370269775, "reward_std": 0.7464114725589752, "rewards/equation_reward_func": 1.5833333730697632, "rewards/format_reward_func": 1.809861183166504, "step": 286 }, { "completion_length": 525.25, "epoch": 0.05872756933115824, "grad_norm": 1.4485621962260458, "kl": 0.150390625, "learning_rate": 5.741318238559209e-07, "loss": 0.0002, "reward": 3.194236159324646, "reward_std": 0.8501316905021667, "rewards/equation_reward_func": 1.166666716337204, "rewards/format_reward_func": 2.027569532394409, "step": 288 }, { "completion_length": 683.4166870117188, "epoch": 0.059135399673735725, "grad_norm": 1.1717608782080553, "kl": 0.1416015625, "learning_rate": 5.686767789092041e-07, "loss": 0.0001, "reward": 2.2353820204734802, "reward_std": 1.0596205294132233, "rewards/equation_reward_func": 0.2916666865348816, "rewards/format_reward_func": 1.9437153339385986, "step": 290 }, { "completion_length": 401.06251525878906, "epoch": 0.05954323001631321, "grad_norm": 1.469208845355834, "kl": 0.18115234375, "learning_rate": 5.632133891699231e-07, "loss": 0.0002, "reward": 3.5194097757339478, "reward_std": 0.6208974719047546, "rewards/equation_reward_func": 1.7083333730697632, "rewards/format_reward_func": 1.8110764622688293, "step": 292 }, { "completion_length": 726.2916870117188, "epoch": 0.0599510603588907, "grad_norm": 1.3400119923719016, "kl": 0.135009765625, "learning_rate": 5.577423184847931e-07, "loss": 0.0001, "reward": 2.288576364517212, "reward_std": 0.4163784384727478, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 2.288576364517212, "step": 294 }, { "completion_length": 408.81251525878906, "epoch": 0.06035889070146819, "grad_norm": 1.6377898052337172, "kl": 0.16259765625, "learning_rate": 5.522642316338268e-07, "loss": 0.0002, "reward": 3.5491667985916138, "reward_std": 0.6619178652763367, "rewards/equation_reward_func": 1.6666666865348816, "rewards/format_reward_func": 1.8825000524520874, "step": 296 }, { "completion_length": 662.7916870117188, "epoch": 0.06076672104404568, "grad_norm": 1.0601282664430811, "kl": 0.16845703125, "learning_rate": 5.467797942495589e-07, "loss": 0.0002, "reward": 2.235729455947876, "reward_std": 1.0819981396198273, "rewards/equation_reward_func": 0.3333333544433117, "rewards/format_reward_func": 1.9023959636688232, "step": 298 }, { "completion_length": 563.1875305175781, "epoch": 0.061174551386623165, "grad_norm": 0.9614419208572907, "kl": 0.193359375, "learning_rate": 5.412896727361662e-07, "loss": 0.0002, "reward": 2.6000348329544067, "reward_std": 0.9562007784843445, "rewards/equation_reward_func": 0.7916666865348816, "rewards/format_reward_func": 1.8083681464195251, "step": 300 }, { "completion_length": 819.2916870117188, "epoch": 0.06158238172920065, "grad_norm": 0.9771200528468638, "kl": 0.128662109375, "learning_rate": 5.357945341884935e-07, "loss": 0.0001, "reward": 1.7778472304344177, "reward_std": 1.077535629272461, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.7778472900390625, "step": 302 }, { "completion_length": 553.4166870117188, "epoch": 0.06199021207177814, "grad_norm": 1.1955694878738563, "kl": 0.23388671875, "learning_rate": 5.302950463109969e-07, "loss": 0.0002, "reward": 2.829687714576721, "reward_std": 1.1161695718765259, "rewards/equation_reward_func": 0.916666716337204, "rewards/format_reward_func": 1.9130208492279053, "step": 304 }, { "completion_length": 710.7291870117188, "epoch": 0.06239804241435563, "grad_norm": 1.3779449810449336, "kl": 0.1650390625, "learning_rate": 5.247918773366111e-07, "loss": 0.0002, "reward": 2.022604286670685, "reward_std": 0.8615612387657166, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 1.9392709136009216, "step": 306 }, { "completion_length": 589.2083435058594, "epoch": 0.06280587275693311, "grad_norm": 1.1497542405094912, "kl": 0.1552734375, "learning_rate": 5.192856959455552e-07, "loss": 0.0002, "reward": 3.0122569799423218, "reward_std": 0.8370742797851562, "rewards/equation_reward_func": 0.9583333544433117, "rewards/format_reward_func": 2.0539236068725586, "step": 308 }, { "completion_length": 373.6041717529297, "epoch": 0.0632137030995106, "grad_norm": 1.527186652658105, "kl": 0.24267578125, "learning_rate": 5.137771711840811e-07, "loss": 0.0002, "reward": 3.4156596660614014, "reward_std": 0.5625910460948944, "rewards/equation_reward_func": 1.6250000596046448, "rewards/format_reward_func": 1.790659785270691, "step": 310 }, { "completion_length": 395.37501525878906, "epoch": 0.0636215334420881, "grad_norm": 1.7273540516921173, "kl": 0.20361328125, "learning_rate": 5.082669723831793e-07, "loss": 0.0002, "reward": 3.51725697517395, "reward_std": 0.6541823446750641, "rewards/equation_reward_func": 1.6666667461395264, "rewards/format_reward_func": 1.8505903482437134, "step": 312 }, { "completion_length": 385.5208435058594, "epoch": 0.06402936378466557, "grad_norm": 1.698447178314297, "kl": 0.2001953125, "learning_rate": 5.027557690772503e-07, "loss": 0.0002, "reward": 3.766666889190674, "reward_std": 0.5374718904495239, "rewards/equation_reward_func": 1.9166667461395264, "rewards/format_reward_func": 1.850000023841858, "step": 314 }, { "completion_length": 597.7083435058594, "epoch": 0.06443719412724307, "grad_norm": 1.2167733375338632, "kl": 0.25439453125, "learning_rate": 4.972442309227498e-07, "loss": 0.0003, "reward": 2.8143749237060547, "reward_std": 0.9374454021453857, "rewards/equation_reward_func": 0.9583333544433117, "rewards/format_reward_func": 1.856041669845581, "step": 316 }, { "completion_length": 687.9375305175781, "epoch": 0.06484502446982056, "grad_norm": 1.1609904401618012, "kl": 0.17822265625, "learning_rate": 4.917330276168208e-07, "loss": 0.0002, "reward": 2.6170140504837036, "reward_std": 1.1427516341209412, "rewards/equation_reward_func": 0.6250000409781933, "rewards/format_reward_func": 1.9920140504837036, "step": 318 }, { "completion_length": 684.1250305175781, "epoch": 0.06525285481239804, "grad_norm": 1.0056701516120197, "kl": 0.20947265625, "learning_rate": 4.86222828815919e-07, "loss": 0.0002, "reward": 2.5053821802139282, "reward_std": 0.9234158992767334, "rewards/equation_reward_func": 0.4166666865348816, "rewards/format_reward_func": 2.088715434074402, "step": 320 }, { "completion_length": 673.8750305175781, "epoch": 0.06566068515497553, "grad_norm": 1.0975585144657385, "kl": 0.18798828125, "learning_rate": 4.807143040544446e-07, "loss": 0.0002, "reward": 3.1653473377227783, "reward_std": 1.1166218519210815, "rewards/equation_reward_func": 1.0416666865348816, "rewards/format_reward_func": 2.123680830001831, "step": 322 }, { "completion_length": 830.4791870117188, "epoch": 0.06606851549755302, "grad_norm": 1.0602033792436276, "kl": 0.1513671875, "learning_rate": 4.752081226633888e-07, "loss": 0.0002, "reward": 1.821250081062317, "reward_std": 1.0921660661697388, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.821250081062317, "step": 324 }, { "completion_length": 639.4375305175781, "epoch": 0.0664763458401305, "grad_norm": 1.100729586233426, "kl": 0.1552734375, "learning_rate": 4.697049536890033e-07, "loss": 0.0002, "reward": 2.8303472995758057, "reward_std": 0.8279085159301758, "rewards/equation_reward_func": 0.7916666865348816, "rewards/format_reward_func": 2.038680672645569, "step": 326 }, { "completion_length": 686.9375305175781, "epoch": 0.06688417618270799, "grad_norm": 1.0245543586959098, "kl": 0.17041015625, "learning_rate": 4.642054658115066e-07, "loss": 0.0002, "reward": 2.826840400695801, "reward_std": 0.9287701547145844, "rewards/equation_reward_func": 0.7916666865348816, "rewards/format_reward_func": 2.0351736545562744, "step": 328 }, { "completion_length": 527.375, "epoch": 0.06729200652528548, "grad_norm": 1.0458228845510584, "kl": 0.173828125, "learning_rate": 4.5871032726383385e-07, "loss": 0.0002, "reward": 3.5178821086883545, "reward_std": 1.0241894721984863, "rewards/equation_reward_func": 1.5416666865348816, "rewards/format_reward_func": 1.9762153625488281, "step": 330 }, { "completion_length": 574.0000152587891, "epoch": 0.06769983686786298, "grad_norm": 1.3045692619665175, "kl": 0.1962890625, "learning_rate": 4.532202057504411e-07, "loss": 0.0002, "reward": 3.1286113262176514, "reward_std": 1.1157885491847992, "rewards/equation_reward_func": 1.125, "rewards/format_reward_func": 2.003611207008362, "step": 332 }, { "completion_length": 783.9375305175781, "epoch": 0.06810766721044045, "grad_norm": 1.0379555283649509, "kl": 0.15966796875, "learning_rate": 4.477357683661733e-07, "loss": 0.0002, "reward": 2.033749997615814, "reward_std": 0.8268265128135681, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 2.0337501168251038, "step": 334 }, { "completion_length": 748.5416870117188, "epoch": 0.06851549755301795, "grad_norm": 1.0608562150572372, "kl": 0.3369140625, "learning_rate": 4.4225768151520694e-07, "loss": 0.0003, "reward": 2.0848264694213867, "reward_std": 0.8205204904079437, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 2.0848264694213867, "step": 336 }, { "completion_length": 815.5625305175781, "epoch": 0.06892332789559544, "grad_norm": 1.0418888295786408, "kl": 0.18310546875, "learning_rate": 4.3678661083007685e-07, "loss": 0.0002, "reward": 1.7861458659172058, "reward_std": 1.0723278522491455, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 1.7861458659172058, "step": 338 }, { "completion_length": 555.1666717529297, "epoch": 0.06933115823817292, "grad_norm": 1.4518474976562326, "kl": 0.17236328125, "learning_rate": 4.313232210907959e-07, "loss": 0.0002, "reward": 3.1211459636688232, "reward_std": 0.9402068853378296, "rewards/equation_reward_func": 1.0416666865348816, "rewards/format_reward_func": 2.0794793367385864, "step": 340 }, { "completion_length": 500.79168701171875, "epoch": 0.06973898858075041, "grad_norm": 1.5942769271225012, "kl": 0.20947265625, "learning_rate": 4.258681761440789e-07, "loss": 0.0002, "reward": 3.415902853012085, "reward_std": 0.6841486990451813, "rewards/equation_reward_func": 1.3750000298023224, "rewards/format_reward_func": 2.04090279340744, "step": 342 }, { "completion_length": 648.5625, "epoch": 0.0701468189233279, "grad_norm": 1.063650258930375, "kl": 0.15478515625, "learning_rate": 4.2042213882268025e-07, "loss": 0.0002, "reward": 2.6829168796539307, "reward_std": 0.8894191086292267, "rewards/equation_reward_func": 0.5416666865348816, "rewards/format_reward_func": 2.1412501335144043, "step": 344 }, { "completion_length": 561.6666870117188, "epoch": 0.07055464926590538, "grad_norm": 1.457579727519852, "kl": 0.17724609375, "learning_rate": 4.149857708648535e-07, "loss": 0.0002, "reward": 2.8364583253860474, "reward_std": 0.47896429151296616, "rewards/equation_reward_func": 0.6666666865348816, "rewards/format_reward_func": 2.1697916984558105, "step": 346 }, { "completion_length": 642.3541870117188, "epoch": 0.07096247960848287, "grad_norm": 1.3912364072475296, "kl": 0.16064453125, "learning_rate": 4.095597328339452e-07, "loss": 0.0002, "reward": 2.601736068725586, "reward_std": 0.6701973676681519, "rewards/equation_reward_func": 0.3333333544433117, "rewards/format_reward_func": 2.2684028148651123, "step": 348 }, { "completion_length": 565.8541870117188, "epoch": 0.07137030995106036, "grad_norm": 1.4335488442307787, "kl": 0.15869140625, "learning_rate": 4.041446840381309e-07, "loss": 0.0002, "reward": 2.8512500524520874, "reward_std": 0.6541395485401154, "rewards/equation_reward_func": 0.7916666865348816, "rewards/format_reward_func": 2.0595834851264954, "step": 350 }, { "completion_length": 592.4583435058594, "epoch": 0.07177814029363784, "grad_norm": 1.6711244477487293, "kl": 0.1640625, "learning_rate": 3.98741282450304e-07, "loss": 0.0002, "reward": 2.683958411216736, "reward_std": 0.9120919704437256, "rewards/equation_reward_func": 0.5833333730697632, "rewards/format_reward_func": 2.1006250977516174, "step": 352 }, { "completion_length": 647.5625305175781, "epoch": 0.07218597063621533, "grad_norm": 1.3418439173813352, "kl": 0.1611328125, "learning_rate": 3.9335018462812664e-07, "loss": 0.0002, "reward": 2.5612502098083496, "reward_std": 0.9770323932170868, "rewards/equation_reward_func": 0.5, "rewards/format_reward_func": 2.06125009059906, "step": 354 }, { "completion_length": 593.5, "epoch": 0.07259380097879282, "grad_norm": 0.8818594196791458, "kl": 0.15478515625, "learning_rate": 3.879720456342521e-07, "loss": 0.0002, "reward": 2.828229308128357, "reward_std": 0.7171844244003296, "rewards/equation_reward_func": 0.7083333730697632, "rewards/format_reward_func": 2.119895815849304, "step": 356 }, { "completion_length": 458.3125, "epoch": 0.07300163132137032, "grad_norm": 1.4981060215710775, "kl": 0.1640625, "learning_rate": 3.8260751895672954e-07, "loss": 0.0002, "reward": 3.318923592567444, "reward_std": 0.8009838759899139, "rewards/equation_reward_func": 1.3333333432674408, "rewards/format_reward_func": 1.9855904579162598, "step": 358 }, { "completion_length": 455.9166717529297, "epoch": 0.0734094616639478, "grad_norm": 1.3586016748782879, "kl": 0.18408203125, "learning_rate": 3.772572564296004e-07, "loss": 0.0002, "reward": 3.7097569704055786, "reward_std": 0.5928686857223511, "rewards/equation_reward_func": 1.7083333730697632, "rewards/format_reward_func": 2.001423716545105, "step": 360 }, { "completion_length": 673.6875305175781, "epoch": 0.07381729200652529, "grad_norm": 1.2301932410337926, "kl": 0.1787109375, "learning_rate": 3.719219081536942e-07, "loss": 0.0002, "reward": 2.9322917461395264, "reward_std": 1.2178776860237122, "rewards/equation_reward_func": 0.8333333730697632, "rewards/format_reward_func": 2.098958373069763, "step": 362 }, { "completion_length": 621.8333435058594, "epoch": 0.07422512234910278, "grad_norm": 1.2515697084532456, "kl": 0.15673828125, "learning_rate": 3.666021224176369e-07, "loss": 0.0002, "reward": 3.1526390314102173, "reward_std": 1.0030421912670135, "rewards/equation_reward_func": 0.9166666865348816, "rewards/format_reward_func": 2.235972285270691, "step": 364 }, { "completion_length": 615.0000305175781, "epoch": 0.07463295269168026, "grad_norm": 1.5169652601615895, "kl": 0.19091796875, "learning_rate": 3.612985456190778e-07, "loss": 0.0002, "reward": 2.936007022857666, "reward_std": 0.7388836741447449, "rewards/equation_reward_func": 0.875, "rewards/format_reward_func": 2.0610069632530212, "step": 366 }, { "completion_length": 592.9583435058594, "epoch": 0.07504078303425775, "grad_norm": 1.4869853846405452, "kl": 0.1611328125, "learning_rate": 3.56011822186147e-07, "loss": 0.0002, "reward": 3.217986226081848, "reward_std": 0.7202288508415222, "rewards/equation_reward_func": 1.0416666716337204, "rewards/format_reward_func": 2.1763195991516113, "step": 368 }, { "completion_length": 696.0, "epoch": 0.07544861337683524, "grad_norm": 1.3749187528497846, "kl": 0.20947265625, "learning_rate": 3.507425944991528e-07, "loss": 0.0002, "reward": 2.269930601119995, "reward_std": 0.710529625415802, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 2.186597466468811, "step": 370 }, { "completion_length": 557.3125305175781, "epoch": 0.07585644371941272, "grad_norm": 1.4000574837745792, "kl": 0.16796875, "learning_rate": 3.454915028125263e-07, "loss": 0.0002, "reward": 3.027916669845581, "reward_std": 1.0261832475662231, "rewards/equation_reward_func": 1.0416666865348816, "rewards/format_reward_func": 1.9862500429153442, "step": 372 }, { "completion_length": 493.1875, "epoch": 0.07626427406199021, "grad_norm": 1.3346862160598578, "kl": 0.17724609375, "learning_rate": 3.4025918517702593e-07, "loss": 0.0002, "reward": 3.429097294807434, "reward_std": 0.7599293291568756, "rewards/equation_reward_func": 1.4166666865348816, "rewards/format_reward_func": 2.012430787086487, "step": 374 }, { "completion_length": 559.7916717529297, "epoch": 0.0766721044045677, "grad_norm": 1.674692584956449, "kl": 0.25830078125, "learning_rate": 3.3504627736220857e-07, "loss": 0.0003, "reward": 3.2719098329544067, "reward_std": 0.7151365131139755, "rewards/equation_reward_func": 1.2083333730697632, "rewards/format_reward_func": 2.063576579093933, "step": 376 }, { "completion_length": 504.3125, "epoch": 0.07707993474714518, "grad_norm": 1.5590357023489072, "kl": 0.21337890625, "learning_rate": 3.2985341277917846e-07, "loss": 0.0002, "reward": 2.8944443464279175, "reward_std": 0.7131877541542053, "rewards/equation_reward_func": 0.8750000409781933, "rewards/format_reward_func": 2.0194445848464966, "step": 378 }, { "completion_length": 735.75, "epoch": 0.07748776508972267, "grad_norm": 1.068431271506457, "kl": 0.1865234375, "learning_rate": 3.2468122240362285e-07, "loss": 0.0002, "reward": 2.7177083492279053, "reward_std": 0.7962678074836731, "rewards/equation_reward_func": 0.4166666865348816, "rewards/format_reward_func": 2.301041841506958, "step": 380 }, { "completion_length": 796.8541870117188, "epoch": 0.07789559543230017, "grad_norm": 1.1087256773204297, "kl": 0.162109375, "learning_rate": 3.195303346991427e-07, "loss": 0.0002, "reward": 2.0952779054641724, "reward_std": 0.8598673939704895, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 2.0952779054641724, "step": 382 }, { "completion_length": 570.0625152587891, "epoch": 0.07830342577487764, "grad_norm": 1.4382086945561208, "kl": 0.17578125, "learning_rate": 3.1440137554088953e-07, "loss": 0.0002, "reward": 2.951319456100464, "reward_std": 0.7555558383464813, "rewards/equation_reward_func": 0.875, "rewards/format_reward_func": 2.076319396495819, "step": 384 }, { "completion_length": 545.7291870117188, "epoch": 0.07871125611745514, "grad_norm": 1.1023634840652674, "kl": 0.20166015625, "learning_rate": 3.092949681395169e-07, "loss": 0.0002, "reward": 3.21753466129303, "reward_std": 1.1398820281028748, "rewards/equation_reward_func": 1.3333333730697632, "rewards/format_reward_func": 1.8842013478279114, "step": 386 }, { "completion_length": 540.5625152587891, "epoch": 0.07911908646003263, "grad_norm": 1.6785853811175349, "kl": 0.17529296875, "learning_rate": 3.042117329654544e-07, "loss": 0.0002, "reward": 3.2422919273376465, "reward_std": 0.8742612600326538, "rewards/equation_reward_func": 1.0833333432674408, "rewards/format_reward_func": 2.1589584350585938, "step": 388 }, { "completion_length": 437.12501525878906, "epoch": 0.07952691680261012, "grad_norm": 1.719678147524432, "kl": 0.25732421875, "learning_rate": 2.9915228767351535e-07, "loss": 0.0003, "reward": 3.636701464653015, "reward_std": 0.723703920841217, "rewards/equation_reward_func": 1.6666666865348816, "rewards/format_reward_func": 1.970034897327423, "step": 390 }, { "completion_length": 665.4375305175781, "epoch": 0.0799347471451876, "grad_norm": 1.2489486438524378, "kl": 0.2080078125, "learning_rate": 2.941172470278476e-07, "loss": 0.0002, "reward": 2.668472409248352, "reward_std": 0.8615556359291077, "rewards/equation_reward_func": 0.541666679084301, "rewards/format_reward_func": 2.1268056631088257, "step": 392 }, { "completion_length": 816.0208435058594, "epoch": 0.08034257748776509, "grad_norm": 0.947304383944691, "kl": 0.18359375, "learning_rate": 2.89107222827234e-07, "loss": 0.0002, "reward": 2.104305624961853, "reward_std": 0.866163969039917, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 2.1043055057525635, "step": 394 }, { "completion_length": 765.4791870117188, "epoch": 0.08075040783034258, "grad_norm": 1.400028124019858, "kl": 0.19287109375, "learning_rate": 2.841228238307536e-07, "loss": 0.0002, "reward": 2.2613543272018433, "reward_std": 0.5083828084170818, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 2.2613543272018433, "step": 396 }, { "completion_length": 781.6250305175781, "epoch": 0.08115823817292006, "grad_norm": 1.0827871893574141, "kl": 0.1767578125, "learning_rate": 2.79164655683813e-07, "loss": 0.0002, "reward": 2.1505903601646423, "reward_std": 0.7696040868759155, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 2.1505903005599976, "step": 398 }, { "completion_length": 493.43751525878906, "epoch": 0.08156606851549755, "grad_norm": 1.2921133198533294, "kl": 0.791015625, "learning_rate": 2.742333208445554e-07, "loss": 0.0008, "reward": 3.811944603919983, "reward_std": 0.5879083275794983, "rewards/equation_reward_func": 1.7500000596046448, "rewards/format_reward_func": 2.061944365501404, "step": 400 }, { "completion_length": 746.0625305175781, "epoch": 0.08197389885807504, "grad_norm": 1.305143762318282, "kl": 0.17529296875, "learning_rate": 2.6932941851065615e-07, "loss": 0.0002, "reward": 2.444791793823242, "reward_std": 0.4394510090351105, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 2.4031251668930054, "step": 402 }, { "completion_length": 544.4791717529297, "epoch": 0.08238172920065252, "grad_norm": 1.1953377442351993, "kl": 0.1748046875, "learning_rate": 2.6445354454651636e-07, "loss": 0.0002, "reward": 3.572013735771179, "reward_std": 0.8854174613952637, "rewards/equation_reward_func": 1.4583333730697632, "rewards/format_reward_func": 2.113680601119995, "step": 404 }, { "completion_length": 804.7708740234375, "epoch": 0.08278955954323002, "grad_norm": 0.9435020676134209, "kl": 0.2001953125, "learning_rate": 2.596062914108601e-07, "loss": 0.0002, "reward": 2.238854169845581, "reward_std": 0.7229233682155609, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 2.238854169845581, "step": 406 }, { "completion_length": 771.4166870117188, "epoch": 0.08319738988580751, "grad_norm": 1.0179706632601526, "kl": 0.171875, "learning_rate": 2.547882480847461e-07, "loss": 0.0002, "reward": 2.2952778339385986, "reward_std": 0.6089069843292236, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 2.2952778339385986, "step": 408 }, { "completion_length": 615.0625, "epoch": 0.08360522022838499, "grad_norm": 1.3095508272095453, "kl": 0.2060546875, "learning_rate": 2.500000000000001e-07, "loss": 0.0002, "reward": 2.916632056236267, "reward_std": 1.1873834133148193, "rewards/equation_reward_func": 0.875, "rewards/format_reward_func": 2.041632056236267, "step": 410 }, { "completion_length": 522.5833587646484, "epoch": 0.08401305057096248, "grad_norm": 1.3099266975063921, "kl": 0.1826171875, "learning_rate": 2.452421289680826e-07, "loss": 0.0002, "reward": 3.69243061542511, "reward_std": 0.7064912915229797, "rewards/equation_reward_func": 1.5833333730697632, "rewards/format_reward_func": 2.1090973615646362, "step": 412 }, { "completion_length": 554.5625305175781, "epoch": 0.08442088091353997, "grad_norm": 1.3331866727652546, "kl": 0.18212890625, "learning_rate": 2.4051521310939254e-07, "loss": 0.0002, "reward": 3.463784694671631, "reward_std": 0.8228816390037537, "rewards/equation_reward_func": 1.291666716337204, "rewards/format_reward_func": 2.1721181869506836, "step": 414 }, { "completion_length": 422.85418701171875, "epoch": 0.08482871125611746, "grad_norm": 1.4824642737659157, "kl": 0.171875, "learning_rate": 2.3581982678302058e-07, "loss": 0.0002, "reward": 3.8071876764297485, "reward_std": 0.43291839957237244, "rewards/equation_reward_func": 1.8750000596046448, "rewards/format_reward_func": 1.9321874976158142, "step": 416 }, { "completion_length": 674.2916870117188, "epoch": 0.08523654159869494, "grad_norm": 1.5561510539962522, "kl": 0.19189453125, "learning_rate": 2.3115654051696092e-07, "loss": 0.0002, "reward": 2.5582985877990723, "reward_std": 0.8232472538948059, "rewards/equation_reward_func": 0.3333333432674408, "rewards/format_reward_func": 2.2249653339385986, "step": 418 }, { "completion_length": 710.0208435058594, "epoch": 0.08564437194127243, "grad_norm": 1.2191039991941799, "kl": 0.205078125, "learning_rate": 2.2652592093878665e-07, "loss": 0.0002, "reward": 2.6046180725097656, "reward_std": 1.0610361099243164, "rewards/equation_reward_func": 0.4583333544433117, "rewards/format_reward_func": 2.146284818649292, "step": 420 }, { "completion_length": 609.6250305175781, "epoch": 0.08605220228384992, "grad_norm": 1.5426998553595659, "kl": 0.38818359375, "learning_rate": 2.2192853070679967e-07, "loss": 0.0004, "reward": 2.992326498031616, "reward_std": 0.6148561537265778, "rewards/equation_reward_func": 0.7916666865348816, "rewards/format_reward_func": 2.2006598711013794, "step": 422 }, { "completion_length": 603.0833587646484, "epoch": 0.0864600326264274, "grad_norm": 1.1676070955306959, "kl": 0.17626953125, "learning_rate": 2.1736492844166404e-07, "loss": 0.0002, "reward": 2.8410418033599854, "reward_std": 0.7288043797016144, "rewards/equation_reward_func": 0.75, "rewards/format_reward_func": 2.091041922569275, "step": 424 }, { "completion_length": 686.1041870117188, "epoch": 0.0868678629690049, "grad_norm": 1.2093809230932122, "kl": 0.16259765625, "learning_rate": 2.128356686585282e-07, "loss": 0.0002, "reward": 2.547639012336731, "reward_std": 1.0525963008403778, "rewards/equation_reward_func": 0.4583333544433117, "rewards/format_reward_func": 2.089305877685547, "step": 426 }, { "completion_length": 528.4166870117188, "epoch": 0.08727569331158239, "grad_norm": 1.493705154956843, "kl": 0.19921875, "learning_rate": 2.0834130169964692e-07, "loss": 0.0002, "reward": 3.80138897895813, "reward_std": 0.7043006718158722, "rewards/equation_reward_func": 1.6666667461395264, "rewards/format_reward_func": 2.1347222328186035, "step": 428 }, { "completion_length": 595.5416870117188, "epoch": 0.08768352365415986, "grad_norm": 1.0806302437363526, "kl": 0.162109375, "learning_rate": 2.0388237366751003e-07, "loss": 0.0002, "reward": 3.173958420753479, "reward_std": 1.080767273902893, "rewards/equation_reward_func": 1.125, "rewards/format_reward_func": 2.0489583611488342, "step": 430 }, { "completion_length": 461.81251525878906, "epoch": 0.08809135399673736, "grad_norm": 1.3305612303160375, "kl": 0.2109375, "learning_rate": 1.9945942635848745e-07, "loss": 0.0002, "reward": 3.8538542985916138, "reward_std": 0.5665659308433533, "rewards/equation_reward_func": 1.8333333730697632, "rewards/format_reward_func": 2.0205209255218506, "step": 432 }, { "completion_length": 655.4375, "epoch": 0.08849918433931485, "grad_norm": 1.5166587801432987, "kl": 0.23876953125, "learning_rate": 1.950729971969955e-07, "loss": 0.0002, "reward": 2.8268750309944153, "reward_std": 0.879076361656189, "rewards/equation_reward_func": 0.7916666865348816, "rewards/format_reward_func": 2.0352084040641785, "step": 434 }, { "completion_length": 768.7083435058594, "epoch": 0.08890701468189233, "grad_norm": 1.3585261445866479, "kl": 0.15380859375, "learning_rate": 1.9072361917019536e-07, "loss": 0.0002, "reward": 2.529687523841858, "reward_std": 0.5843232274055481, "rewards/equation_reward_func": 0.125, "rewards/format_reward_func": 2.4046876430511475, "step": 436 }, { "completion_length": 642.6458435058594, "epoch": 0.08931484502446982, "grad_norm": 1.2793508732682657, "kl": 0.18505859375, "learning_rate": 1.8641182076323148e-07, "loss": 0.0002, "reward": 3.2028820514678955, "reward_std": 0.9111791253089905, "rewards/equation_reward_func": 0.8750000298023224, "rewards/format_reward_func": 2.327882170677185, "step": 438 }, { "completion_length": 590.1041870117188, "epoch": 0.08972267536704731, "grad_norm": 1.243315094285279, "kl": 0.1865234375, "learning_rate": 1.8213812589501608e-07, "loss": 0.0002, "reward": 3.199236273765564, "reward_std": 0.7947587668895721, "rewards/equation_reward_func": 1.083333358168602, "rewards/format_reward_func": 2.115902900695801, "step": 440 }, { "completion_length": 667.2083435058594, "epoch": 0.0901305057096248, "grad_norm": 1.4563035634925794, "kl": 0.15966796875, "learning_rate": 1.7790305385456795e-07, "loss": 0.0002, "reward": 2.744722366333008, "reward_std": 0.8301202952861786, "rewards/equation_reward_func": 0.5416666865348816, "rewards/format_reward_func": 2.2030556201934814, "step": 442 }, { "completion_length": 628.7916870117188, "epoch": 0.09053833605220228, "grad_norm": 1.5549898799881567, "kl": 0.18359375, "learning_rate": 1.7370711923791564e-07, "loss": 0.0002, "reward": 2.978472352027893, "reward_std": 0.7657686173915863, "rewards/equation_reward_func": 0.875, "rewards/format_reward_func": 2.1034722328186035, "step": 444 }, { "completion_length": 589.9375152587891, "epoch": 0.09094616639477977, "grad_norm": 1.3961044134596396, "kl": 0.16455078125, "learning_rate": 1.6955083188556946e-07, "loss": 0.0002, "reward": 2.9413541555404663, "reward_std": 0.869944304227829, "rewards/equation_reward_func": 0.75, "rewards/format_reward_func": 2.191354274749756, "step": 446 }, { "completion_length": 588.5625305175781, "epoch": 0.09135399673735727, "grad_norm": 1.2868775692231729, "kl": 0.16845703125, "learning_rate": 1.6543469682057104e-07, "loss": 0.0002, "reward": 3.0225348472595215, "reward_std": 0.6649808585643768, "rewards/equation_reward_func": 0.9166666679084301, "rewards/format_reward_func": 2.105868101119995, "step": 448 }, { "completion_length": 619.4791717529297, "epoch": 0.09176182707993474, "grad_norm": 1.3540986287392276, "kl": 0.234375, "learning_rate": 1.6135921418712955e-07, "loss": 0.0002, "reward": 3.06413197517395, "reward_std": 0.5509577691555023, "rewards/equation_reward_func": 0.9166666865348816, "rewards/format_reward_func": 2.147465467453003, "step": 450 }, { "completion_length": 643.9166870117188, "epoch": 0.09216965742251224, "grad_norm": 1.3600242782766807, "kl": 0.169921875, "learning_rate": 1.5732487918985015e-07, "loss": 0.0002, "reward": 2.942257046699524, "reward_std": 0.6768557727336884, "rewards/equation_reward_func": 0.6666666865348816, "rewards/format_reward_func": 2.275590419769287, "step": 452 }, { "completion_length": 748.5625305175781, "epoch": 0.09257748776508973, "grad_norm": 1.0093428243067948, "kl": 0.15673828125, "learning_rate": 1.533321820335624e-07, "loss": 0.0002, "reward": 2.5672223567962646, "reward_std": 0.9604451656341553, "rewards/equation_reward_func": 0.3333333432674408, "rewards/format_reward_func": 2.233889102935791, "step": 454 }, { "completion_length": 631.9791870117188, "epoch": 0.0929853181076672, "grad_norm": 1.4428676303535133, "kl": 0.17236328125, "learning_rate": 1.493816078637557e-07, "loss": 0.0002, "reward": 3.1071181297302246, "reward_std": 0.5816036462783813, "rewards/equation_reward_func": 0.875, "rewards/format_reward_func": 2.232118010520935, "step": 456 }, { "completion_length": 745.5625, "epoch": 0.0933931484502447, "grad_norm": 0.9921529304927418, "kl": 0.16748046875, "learning_rate": 1.4547363670763136e-07, "loss": 0.0002, "reward": 2.374131917953491, "reward_std": 0.40968185663223267, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 2.374131917953491, "step": 458 }, { "completion_length": 705.0833740234375, "epoch": 0.09380097879282219, "grad_norm": 1.0713444392216345, "kl": 0.18310546875, "learning_rate": 1.4160874341577444e-07, "loss": 0.0002, "reward": 2.8997570276260376, "reward_std": 0.7406170666217804, "rewards/equation_reward_func": 0.5833333730697632, "rewards/format_reward_func": 2.3164236545562744, "step": 460 }, { "completion_length": 635.0000305175781, "epoch": 0.09420880913539967, "grad_norm": 1.3688348766889487, "kl": 0.18115234375, "learning_rate": 1.3778739760445552e-07, "loss": 0.0002, "reward": 2.9428473711013794, "reward_std": 0.6385838389396667, "rewards/equation_reward_func": 0.7083333730697632, "rewards/format_reward_func": 2.234513998031616, "step": 462 }, { "completion_length": 637.1458435058594, "epoch": 0.09461663947797716, "grad_norm": 1.07823940444348, "kl": 0.1787109375, "learning_rate": 1.3401006359856916e-07, "loss": 0.0002, "reward": 3.116041660308838, "reward_std": 0.6116780638694763, "rewards/equation_reward_func": 0.9166666865348816, "rewards/format_reward_func": 2.1993749141693115, "step": 464 }, { "completion_length": 671.1041870117188, "epoch": 0.09502446982055465, "grad_norm": 1.0251069650416453, "kl": 0.1611328125, "learning_rate": 1.3027720037521395e-07, "loss": 0.0002, "reward": 3.0604861974716187, "reward_std": 0.6963988989591599, "rewards/equation_reward_func": 0.9583333730697632, "rewards/format_reward_func": 2.1021528244018555, "step": 466 }, { "completion_length": 508.33335876464844, "epoch": 0.09543230016313213, "grad_norm": 1.2041214798556659, "kl": 0.1513671875, "learning_rate": 1.2658926150792322e-07, "loss": 0.0002, "reward": 3.716770887374878, "reward_std": 0.8307555913925171, "rewards/equation_reward_func": 1.6666666865348816, "rewards/format_reward_func": 2.050104081630707, "step": 468 }, { "completion_length": 702.5625, "epoch": 0.09584013050570962, "grad_norm": 1.4167156857768421, "kl": 0.171875, "learning_rate": 1.229466951115519e-07, "loss": 0.0002, "reward": 2.8170487880706787, "reward_std": 0.932531863451004, "rewards/equation_reward_func": 0.6666666865348816, "rewards/format_reward_func": 2.1503820419311523, "step": 470 }, { "completion_length": 440.3958435058594, "epoch": 0.09624796084828711, "grad_norm": 1.2804118870894239, "kl": 0.205078125, "learning_rate": 1.193499437878277e-07, "loss": 0.0002, "reward": 3.693055510520935, "reward_std": 0.6622753441333771, "rewards/equation_reward_func": 1.7083333730697632, "rewards/format_reward_func": 1.9847222566604614, "step": 472 }, { "completion_length": 654.7291870117188, "epoch": 0.0966557911908646, "grad_norm": 1.4440438187505384, "kl": 0.20458984375, "learning_rate": 1.1579944457157059e-07, "loss": 0.0002, "reward": 2.8649654388427734, "reward_std": 0.7565539479255676, "rewards/equation_reward_func": 0.6666666865348816, "rewards/format_reward_func": 2.198298692703247, "step": 474 }, { "completion_length": 618.1250305175781, "epoch": 0.09706362153344208, "grad_norm": 1.2861995217349655, "kl": 0.16748046875, "learning_rate": 1.1229562887758925e-07, "loss": 0.0002, "reward": 2.932604193687439, "reward_std": 0.7926245033740997, "rewards/equation_reward_func": 0.7916666679084301, "rewards/format_reward_func": 2.1409374475479126, "step": 476 }, { "completion_length": 566.7083435058594, "epoch": 0.09747145187601958, "grad_norm": 1.2073113401945212, "kl": 0.1943359375, "learning_rate": 1.088389224482617e-07, "loss": 0.0002, "reward": 3.6285417079925537, "reward_std": 0.7963749468326569, "rewards/equation_reward_func": 1.4583333730697632, "rewards/format_reward_func": 2.170208215713501, "step": 478 }, { "completion_length": 616.9791870117188, "epoch": 0.09787928221859707, "grad_norm": 3.5720337342156494, "kl": 0.1904296875, "learning_rate": 1.0542974530180327e-07, "loss": 0.0002, "reward": 3.047569513320923, "reward_std": 0.9450699985027313, "rewards/equation_reward_func": 0.833333358168602, "rewards/format_reward_func": 2.2142361402511597, "step": 480 }, { "completion_length": 656.5833435058594, "epoch": 0.09828711256117455, "grad_norm": 1.3363621901974427, "kl": 0.1962890625, "learning_rate": 1.0206851168123076e-07, "loss": 0.0002, "reward": 2.9283682107925415, "reward_std": 0.7495492100715637, "rewards/equation_reward_func": 0.7083333730697632, "rewards/format_reward_func": 2.2200348377227783, "step": 482 }, { "completion_length": 584.3125152587891, "epoch": 0.09869494290375204, "grad_norm": 1.235985667420378, "kl": 0.18017578125, "learning_rate": 9.875563000402948e-08, "loss": 0.0002, "reward": 3.0726042985916138, "reward_std": 0.9599539935588837, "rewards/equation_reward_func": 0.958333358168602, "rewards/format_reward_func": 2.1142709255218506, "step": 484 }, { "completion_length": 602.3958435058594, "epoch": 0.09910277324632953, "grad_norm": 1.352419507127413, "kl": 0.18310546875, "learning_rate": 9.549150281252632e-08, "loss": 0.0002, "reward": 3.627708315849304, "reward_std": 0.8821892440319061, "rewards/equation_reward_func": 1.4166666865348816, "rewards/format_reward_func": 2.211041808128357, "step": 486 }, { "completion_length": 684.2916870117188, "epoch": 0.09951060358890701, "grad_norm": 1.406310592245728, "kl": 0.181640625, "learning_rate": 9.22765267249776e-08, "loss": 0.0002, "reward": 2.766076445579529, "reward_std": 0.6145432703197002, "rewards/equation_reward_func": 0.4583333432674408, "rewards/format_reward_func": 2.3077430725097656, "step": 488 }, { "completion_length": 657.7916870117188, "epoch": 0.0999184339314845, "grad_norm": 1.3318403971159751, "kl": 0.2080078125, "learning_rate": 8.911109238737747e-08, "loss": 0.0002, "reward": 3.1873958110809326, "reward_std": 0.37190073914825916, "rewards/equation_reward_func": 0.8333333730697632, "rewards/format_reward_func": 2.354062557220459, "step": 490 }, { "completion_length": 756.6041870117188, "epoch": 0.100326264274062, "grad_norm": 1.4061978494714438, "kl": 0.20703125, "learning_rate": 8.599558442598998e-08, "loss": 0.0002, "reward": 2.171909749507904, "reward_std": 0.6123473569750786, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 2.171909749507904, "step": 492 }, { "completion_length": 590.2916717529297, "epoch": 0.10073409461663947, "grad_norm": 1.17520751465437, "kl": 0.19677734375, "learning_rate": 8.293038140061515e-08, "loss": 0.0002, "reward": 3.068298816680908, "reward_std": 0.5875828564167023, "rewards/equation_reward_func": 0.9166666865348816, "rewards/format_reward_func": 2.151632070541382, "step": 494 }, { "completion_length": 689.9166870117188, "epoch": 0.10114192495921696, "grad_norm": 1.329822562589213, "kl": 0.19287109375, "learning_rate": 7.991585575858961e-08, "loss": 0.0002, "reward": 2.7232291102409363, "reward_std": 0.9429112374782562, "rewards/equation_reward_func": 0.625, "rewards/format_reward_func": 2.0982291102409363, "step": 496 }, { "completion_length": 644.4791717529297, "epoch": 0.10154975530179446, "grad_norm": 1.7033208401953535, "kl": 0.17529296875, "learning_rate": 7.695237378953224e-08, "loss": 0.0002, "reward": 2.9771876335144043, "reward_std": 0.73157799243927, "rewards/equation_reward_func": 0.7916666865348816, "rewards/format_reward_func": 2.1855210065841675, "step": 498 }, { "completion_length": 737.6041870117188, "epoch": 0.10195758564437195, "grad_norm": 1.713508657462858, "kl": 0.18017578125, "learning_rate": 7.404029558083652e-08, "loss": 0.0002, "reward": 2.4721529483795166, "reward_std": 0.4938492923974991, "rewards/equation_reward_func": 0.0833333358168602, "rewards/format_reward_func": 2.388819456100464, "step": 500 }, { "completion_length": 665.3333740234375, "epoch": 0.10236541598694943, "grad_norm": 1.0340163471089046, "kl": 0.18701171875, "learning_rate": 7.117997497391648e-08, "loss": 0.0002, "reward": 2.9871530532836914, "reward_std": 0.7464583814144135, "rewards/equation_reward_func": 0.7916666865348816, "rewards/format_reward_func": 2.195486307144165, "step": 502 }, { "completion_length": 661.5625305175781, "epoch": 0.10277324632952692, "grad_norm": 1.0282672590274708, "kl": 0.17529296875, "learning_rate": 6.837175952121304e-08, "loss": 0.0002, "reward": 2.9892709255218506, "reward_std": 0.6545587778091431, "rewards/equation_reward_func": 0.7083333730697632, "rewards/format_reward_func": 2.2809375524520874, "step": 504 }, { "completion_length": 788.7500305175781, "epoch": 0.10318107667210441, "grad_norm": 1.2155250881363209, "kl": 0.1943359375, "learning_rate": 6.561599044396288e-08, "loss": 0.0002, "reward": 2.4754514694213867, "reward_std": 1.2856568098068237, "rewards/equation_reward_func": 0.5, "rewards/format_reward_func": 1.9754514694213867, "step": 506 }, { "completion_length": 689.5833435058594, "epoch": 0.10358890701468189, "grad_norm": 1.1923299231333921, "kl": 0.19775390625, "learning_rate": 6.291300259073722e-08, "loss": 0.0002, "reward": 2.958611249923706, "reward_std": 0.8271161913871765, "rewards/equation_reward_func": 0.8333333730697632, "rewards/format_reward_func": 2.125277876853943, "step": 508 }, { "completion_length": 637.3958435058594, "epoch": 0.10399673735725938, "grad_norm": 1.594736749913249, "kl": 0.2021484375, "learning_rate": 6.026312439675551e-08, "loss": 0.0002, "reward": 2.7334723472595215, "reward_std": 0.9248130321502686, "rewards/equation_reward_func": 0.625, "rewards/format_reward_func": 2.1084723472595215, "step": 510 }, { "completion_length": 648.875, "epoch": 0.10440456769983687, "grad_norm": 1.1533792615058107, "kl": 0.18505859375, "learning_rate": 5.7666677843977053e-08, "loss": 0.0002, "reward": 2.86263906955719, "reward_std": 0.8505788147449493, "rewards/equation_reward_func": 0.75, "rewards/format_reward_func": 2.1126389503479004, "step": 512 }, { "completion_length": 619.9166870117188, "epoch": 0.10481239804241435, "grad_norm": 1.1528844064024761, "kl": 0.18896484375, "learning_rate": 5.5123978421978464e-08, "loss": 0.0002, "reward": 3.175590395927429, "reward_std": 0.8909508585929871, "rewards/equation_reward_func": 1.1250000298023224, "rewards/format_reward_func": 2.050590455532074, "step": 514 }, { "completion_length": 652.8541870117188, "epoch": 0.10522022838499184, "grad_norm": 1.026362576623631, "kl": 0.18408203125, "learning_rate": 5.263533508961826e-08, "loss": 0.0002, "reward": 2.9189236164093018, "reward_std": 0.7488152384757996, "rewards/equation_reward_func": 0.7083333730697632, "rewards/format_reward_func": 2.210590362548828, "step": 516 }, { "completion_length": 659.4583435058594, "epoch": 0.10562805872756934, "grad_norm": 1.3341872745914232, "kl": 0.185546875, "learning_rate": 5.0201050237496435e-08, "loss": 0.0002, "reward": 2.915416717529297, "reward_std": 0.785500556230545, "rewards/equation_reward_func": 0.7916666865348816, "rewards/format_reward_func": 2.12375009059906, "step": 518 }, { "completion_length": 724.3333435058594, "epoch": 0.10603588907014681, "grad_norm": 1.1389027986321225, "kl": 0.16650390625, "learning_rate": 4.7821419651211284e-08, "loss": 0.0002, "reward": 2.604514002799988, "reward_std": 0.9652212858200073, "rewards/equation_reward_func": 0.4166666865348816, "rewards/format_reward_func": 2.187847375869751, "step": 520 }, { "completion_length": 616.1666717529297, "epoch": 0.1064437194127243, "grad_norm": 1.6950481305726997, "kl": 0.17822265625, "learning_rate": 4.549673247541874e-08, "loss": 0.0002, "reward": 2.983611226081848, "reward_std": 0.5816805064678192, "rewards/equation_reward_func": 0.75, "rewards/format_reward_func": 2.233611226081848, "step": 522 }, { "completion_length": 786.8541870117188, "epoch": 0.1068515497553018, "grad_norm": 0.9868783610320885, "kl": 0.18701171875, "learning_rate": 4.322727117869951e-08, "loss": 0.0002, "reward": 2.284409761428833, "reward_std": 0.6448712944984436, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 2.284409761428833, "step": 524 }, { "completion_length": 675.7291870117188, "epoch": 0.10725938009787928, "grad_norm": 1.5648006375954522, "kl": 0.1806640625, "learning_rate": 4.1013311519236485e-08, "loss": 0.0002, "reward": 2.776354193687439, "reward_std": 0.9456824660301208, "rewards/equation_reward_func": 0.7083333730697632, "rewards/format_reward_func": 2.0680209398269653, "step": 526 }, { "completion_length": 667.4375305175781, "epoch": 0.10766721044045677, "grad_norm": 1.53346953576534, "kl": 0.1943359375, "learning_rate": 3.8855122511307626e-08, "loss": 0.0002, "reward": 2.9528820514678955, "reward_std": 0.8039775192737579, "rewards/equation_reward_func": 0.7500000409781933, "rewards/format_reward_func": 2.2028820514678955, "step": 528 }, { "completion_length": 630.2083740234375, "epoch": 0.10807504078303426, "grad_norm": 1.5015454500255272, "kl": 0.18505859375, "learning_rate": 3.6752966392599117e-08, "loss": 0.0002, "reward": 2.824340343475342, "reward_std": 0.8222399055957794, "rewards/equation_reward_func": 0.5833333432674408, "rewards/format_reward_func": 2.2410069704055786, "step": 530 }, { "completion_length": 794.3541870117188, "epoch": 0.10848287112561175, "grad_norm": 1.0374860320295616, "kl": 0.17626953125, "learning_rate": 3.470709859234083e-08, "loss": 0.0002, "reward": 2.1905903816223145, "reward_std": 0.7955919802188873, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 2.1905903816223145, "step": 532 }, { "completion_length": 774.3541870117188, "epoch": 0.10889070146818923, "grad_norm": 1.120736957546931, "kl": 0.2001953125, "learning_rate": 3.271776770026963e-08, "loss": 0.0002, "reward": 2.2923611402511597, "reward_std": 0.7566681504249573, "rewards/equation_reward_func": 0.0416666679084301, "rewards/format_reward_func": 2.250694513320923, "step": 534 }, { "completion_length": 631.375, "epoch": 0.10929853181076672, "grad_norm": 1.1960849785247343, "kl": 0.20947265625, "learning_rate": 3.0785215436423985e-08, "loss": 0.0002, "reward": 3.0184723138809204, "reward_std": 0.9326076507568359, "rewards/equation_reward_func": 1.0, "rewards/format_reward_func": 2.0184723138809204, "step": 536 }, { "completion_length": 697.6041870117188, "epoch": 0.10970636215334421, "grad_norm": 1.3324134907829273, "kl": 0.19091796875, "learning_rate": 2.8909676621772848e-08, "loss": 0.0002, "reward": 2.5407986640930176, "reward_std": 0.8522857427597046, "rewards/equation_reward_func": 0.3333333432674408, "rewards/format_reward_func": 2.2074652910232544, "step": 538 }, { "completion_length": 661.75, "epoch": 0.11011419249592169, "grad_norm": 1.4930494245585781, "kl": 0.18701171875, "learning_rate": 2.7091379149682682e-08, "loss": 0.0002, "reward": 2.877777934074402, "reward_std": 0.41776843182742596, "rewards/equation_reward_func": 0.5416666865348816, "rewards/format_reward_func": 2.3361111879348755, "step": 540 }, { "completion_length": 503.8958435058594, "epoch": 0.11052202283849918, "grad_norm": 1.4985848888652291, "kl": 0.310546875, "learning_rate": 2.5330543958227035e-08, "loss": 0.0003, "reward": 3.7885764837265015, "reward_std": 0.5873951315879822, "rewards/equation_reward_func": 1.7083333730697632, "rewards/format_reward_func": 2.0802430510520935, "step": 542 }, { "completion_length": 812.0416870117188, "epoch": 0.11092985318107668, "grad_norm": 1.0755426609459213, "kl": 0.16943359375, "learning_rate": 2.362738500334055e-08, "loss": 0.0002, "reward": 2.2252084016799927, "reward_std": 0.7724728882312775, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 2.2252084016799927, "step": 544 }, { "completion_length": 589.3541717529297, "epoch": 0.11133768352365415, "grad_norm": 1.2868495738115369, "kl": 0.154296875, "learning_rate": 2.1982109232821176e-08, "loss": 0.0002, "reward": 3.426076292991638, "reward_std": 0.8766676485538483, "rewards/equation_reward_func": 1.2083333432674408, "rewards/format_reward_func": 2.217743158340454, "step": 546 }, { "completion_length": 697.1041870117188, "epoch": 0.11174551386623165, "grad_norm": 1.5843230276459015, "kl": 0.17822265625, "learning_rate": 2.0394916561185084e-08, "loss": 0.0002, "reward": 2.429861068725586, "reward_std": 0.6437118351459503, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 2.263194441795349, "step": 548 }, { "completion_length": 620.0416870117188, "epoch": 0.11215334420880914, "grad_norm": 1.43390485144146, "kl": 0.2021484375, "learning_rate": 1.8865999845374792e-08, "loss": 0.0002, "reward": 2.9577430486679077, "reward_std": 0.7144142985343933, "rewards/equation_reward_func": 0.9166666865348816, "rewards/format_reward_func": 2.0410765409469604, "step": 550 }, { "completion_length": 591.4375, "epoch": 0.11256117455138662, "grad_norm": 1.772375367915741, "kl": 0.26416015625, "learning_rate": 1.7395544861325718e-08, "loss": 0.0003, "reward": 3.147847294807434, "reward_std": 0.8900530934333801, "rewards/equation_reward_func": 0.958333358168602, "rewards/format_reward_func": 2.189513921737671, "step": 552 }, { "completion_length": 644.6666717529297, "epoch": 0.11296900489396411, "grad_norm": 1.0011090616007174, "kl": 0.15673828125, "learning_rate": 1.598373028139266e-08, "loss": 0.0002, "reward": 2.8859028816223145, "reward_std": 0.8658215999603271, "rewards/equation_reward_func": 0.75, "rewards/format_reward_func": 2.1359028220176697, "step": 554 }, { "completion_length": 802.8541870117188, "epoch": 0.1133768352365416, "grad_norm": 1.0859783026177827, "kl": 0.1611328125, "learning_rate": 1.4630727652640007e-08, "loss": 0.0002, "reward": 2.3715277910232544, "reward_std": 0.9544045031070709, "rewards/equation_reward_func": 0.1666666716337204, "rewards/format_reward_func": 2.204861283302307, "step": 556 }, { "completion_length": 653.8958435058594, "epoch": 0.1137846655791191, "grad_norm": 1.324673649956278, "kl": 0.189453125, "learning_rate": 1.3336701375997127e-08, "loss": 0.0002, "reward": 3.0635764598846436, "reward_std": 0.6987862586975098, "rewards/equation_reward_func": 0.75, "rewards/format_reward_func": 2.313576579093933, "step": 558 }, { "completion_length": 615.7708435058594, "epoch": 0.11419249592169657, "grad_norm": 1.9720955384622545, "kl": 0.1845703125, "learning_rate": 1.2101808686282189e-08, "loss": 0.0002, "reward": 2.958263874053955, "reward_std": 0.5063729882240295, "rewards/equation_reward_func": 0.7083333730697632, "rewards/format_reward_func": 2.2499306201934814, "step": 560 }, { "completion_length": 644.4166870117188, "epoch": 0.11460032626427406, "grad_norm": 0.9932246904810618, "kl": 0.1787109375, "learning_rate": 1.0926199633097154e-08, "loss": 0.0002, "reward": 2.9698264598846436, "reward_std": 0.7344387173652649, "rewards/equation_reward_func": 0.7916666865348816, "rewards/format_reward_func": 2.1781598329544067, "step": 562 }, { "completion_length": 647.9166870117188, "epoch": 0.11500815660685156, "grad_norm": 1.3817710664807803, "kl": 0.1806640625, "learning_rate": 9.810017062595321e-09, "loss": 0.0002, "reward": 2.9287848472595215, "reward_std": 0.8746606707572937, "rewards/equation_reward_func": 0.75, "rewards/format_reward_func": 2.1787847876548767, "step": 564 }, { "completion_length": 471.43751525878906, "epoch": 0.11541598694942903, "grad_norm": 1.3352359323731087, "kl": 0.19482421875, "learning_rate": 8.753396600124252e-09, "loss": 0.0002, "reward": 3.6786112785339355, "reward_std": 0.8317141830921173, "rewards/equation_reward_func": 1.7083333730697632, "rewards/format_reward_func": 1.9702778458595276, "step": 566 }, { "completion_length": 744.6041870117188, "epoch": 0.11582381729200653, "grad_norm": 1.4277278138028093, "kl": 0.1787109375, "learning_rate": 7.756466633746406e-09, "loss": 0.0002, "reward": 2.5164932012557983, "reward_std": 0.5315538048744202, "rewards/equation_reward_func": 0.125, "rewards/format_reward_func": 2.391493082046509, "step": 568 }, { "completion_length": 658.3333435058594, "epoch": 0.11623164763458402, "grad_norm": 1.1591152772878843, "kl": 0.189453125, "learning_rate": 6.819348298638839e-09, "loss": 0.0002, "reward": 2.8295485973358154, "reward_std": 0.7308537364006042, "rewards/equation_reward_func": 0.5833333730697632, "rewards/format_reward_func": 2.246215343475342, "step": 570 }, { "completion_length": 578.1875305175781, "epoch": 0.1166394779771615, "grad_norm": 1.4699911779642927, "kl": 0.21044921875, "learning_rate": 5.942155462374199e-09, "loss": 0.0002, "reward": 3.107847213745117, "reward_std": 0.4525897800922394, "rewards/equation_reward_func": 0.9583333730697632, "rewards/format_reward_func": 2.149513900279999, "step": 572 }, { "completion_length": 575.4375305175781, "epoch": 0.11704730831973899, "grad_norm": 1.4198721859673429, "kl": 0.19775390625, "learning_rate": 5.1249947110849626e-09, "loss": 0.0002, "reward": 3.4872570037841797, "reward_std": 0.8599075376987457, "rewards/equation_reward_func": 1.2916666865348816, "rewards/format_reward_func": 2.195590376853943, "step": 574 }, { "completion_length": 705.6041870117188, "epoch": 0.11745513866231648, "grad_norm": 1.4989903624292338, "kl": 0.1923828125, "learning_rate": 4.367965336512403e-09, "loss": 0.0002, "reward": 2.398923635482788, "reward_std": 0.3067256808280945, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 2.398923873901367, "step": 576 }, { "completion_length": 755.6875305175781, "epoch": 0.11786296900489396, "grad_norm": 1.1452197538999578, "kl": 0.1962890625, "learning_rate": 3.671159323941797e-09, "loss": 0.0002, "reward": 2.24670147895813, "reward_std": 0.6733859181404114, "rewards/equation_reward_func": 0.0, "rewards/format_reward_func": 2.24670147895813, "step": 578 }, { "completion_length": 664.6666870117188, "epoch": 0.11827079934747145, "grad_norm": 1.2449058428646427, "kl": 0.197265625, "learning_rate": 3.0346613410252574e-09, "loss": 0.0002, "reward": 3.29194438457489, "reward_std": 0.9043855667114258, "rewards/equation_reward_func": 1.0, "rewards/format_reward_func": 2.29194438457489, "step": 580 }, { "completion_length": 617.7291717529297, "epoch": 0.11867862969004894, "grad_norm": 1.6256483463504106, "kl": 0.23583984375, "learning_rate": 2.458548727494292e-09, "loss": 0.0002, "reward": 3.077187657356262, "reward_std": 0.5232462398707867, "rewards/equation_reward_func": 0.875, "rewards/format_reward_func": 2.2021875381469727, "step": 582 }, { "completion_length": 562.9791870117188, "epoch": 0.11908646003262642, "grad_norm": 1.3493720634814992, "kl": 0.20654296875, "learning_rate": 1.942891485762044e-09, "loss": 0.0002, "reward": 3.354375123977661, "reward_std": 0.5960628092288971, "rewards/equation_reward_func": 1.1666666865348816, "rewards/format_reward_func": 2.1877083778381348, "step": 584 }, { "completion_length": 643.0625305175781, "epoch": 0.11949429037520391, "grad_norm": 1.4429671637976917, "kl": 0.1787109375, "learning_rate": 1.4877522724175972e-09, "loss": 0.0002, "reward": 2.989027738571167, "reward_std": 1.0448077917099, "rewards/equation_reward_func": 0.8333333730697632, "rewards/format_reward_func": 2.1556944847106934, "step": 586 }, { "completion_length": 545.5208435058594, "epoch": 0.1199021207177814, "grad_norm": 1.3733935240774355, "kl": 0.18994140625, "learning_rate": 1.0931863906127325e-09, "loss": 0.0002, "reward": 3.299618124961853, "reward_std": 1.0864940881729126, "rewards/equation_reward_func": 1.1250000596046448, "rewards/format_reward_func": 2.174618124961853, "step": 588 }, { "completion_length": 597.1666870117188, "epoch": 0.1203099510603589, "grad_norm": 1.2952890180323302, "kl": 0.20654296875, "learning_rate": 7.592417833419129e-10, "loss": 0.0002, "reward": 3.2572569847106934, "reward_std": 0.8164662718772888, "rewards/equation_reward_func": 1.0833333730697632, "rewards/format_reward_func": 2.1739237308502197, "step": 590 }, { "completion_length": 557.7083435058594, "epoch": 0.12071778140293637, "grad_norm": 1.419396729242162, "kl": 0.18310546875, "learning_rate": 4.859590276170556e-10, "loss": 0.0002, "reward": 3.1976042985916138, "reward_std": 0.1676994524896145, "rewards/equation_reward_func": 1.0, "rewards/format_reward_func": 2.197604179382324, "step": 592 }, { "completion_length": 640.6666870117188, "epoch": 0.12112561174551387, "grad_norm": 1.7175944027716497, "kl": 0.21728515625, "learning_rate": 2.733713295369755e-10, "loss": 0.0002, "reward": 2.8080209493637085, "reward_std": 0.7391078174114227, "rewards/equation_reward_func": 0.5416666865348816, "rewards/format_reward_func": 2.266354203224182, "step": 594 }, { "completion_length": 658.6458435058594, "epoch": 0.12153344208809136, "grad_norm": 1.1130542587727132, "kl": 0.17919921875, "learning_rate": 1.215045202527243e-10, "loss": 0.0002, "reward": 2.9176390171051025, "reward_std": 0.8149993717670441, "rewards/equation_reward_func": 0.6666666865348816, "rewards/format_reward_func": 2.250972270965576, "step": 596 }, { "completion_length": 666.5416870117188, "epoch": 0.12194127243066884, "grad_norm": 1.3351873357408293, "kl": 0.1982421875, "learning_rate": 3.037705282848968e-11, "loss": 0.0002, "reward": 2.8039932250976562, "reward_std": 0.471679862588644, "rewards/equation_reward_func": 0.4583333432674408, "rewards/format_reward_func": 2.345659852027893, "step": 598 }, { "completion_length": 642.3125305175781, "epoch": 0.12234910277324633, "grad_norm": 1.3356459953636426, "kl": 0.21044921875, "learning_rate": 0.0, "loss": 0.0002, "reward": 2.9183679819107056, "reward_std": 0.771289050579071, "rewards/equation_reward_func": 0.75, "rewards/format_reward_func": 2.168368101119995, "step": 600 }, { "epoch": 0.12234910277324633, "step": 600, "total_flos": 0.0, "train_loss": 0.00014642298419068685, "train_runtime": 10716.9446, "train_samples_per_second": 1.344, "train_steps_per_second": 0.056 } ], "logging_steps": 2, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }