| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.12234910277324633, | |
| "eval_steps": 500, | |
| "global_step": 600, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 479.9791717529297, | |
| "epoch": 0.0004078303425774878, | |
| "grad_norm": 3.6416013248561003, | |
| "kl": 0.0, | |
| "learning_rate": 6.666666666666667e-08, | |
| "loss": 0.0, | |
| "reward": 0.14017362147569656, | |
| "reward_std": 0.433171808719635, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 0.09850695356726646, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 370.3333435058594, | |
| "epoch": 0.0008156606851549756, | |
| "grad_norm": 2.7919516711454913, | |
| "kl": 0.00021123886108398438, | |
| "learning_rate": 1.3333333333333334e-07, | |
| "loss": 0.0, | |
| "reward": 0.36586807668209076, | |
| "reward_std": 0.5816957801580429, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 0.32420141994953156, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 381.8958435058594, | |
| "epoch": 0.0012234910277324632, | |
| "grad_norm": 2.1831646783127723, | |
| "kl": 0.00020503997802734375, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0, | |
| "reward": 0.19583334028720856, | |
| "reward_std": 0.49165327847003937, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 0.19583334028720856, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 370.5625, | |
| "epoch": 0.0016313213703099511, | |
| "grad_norm": 2.7704350046014077, | |
| "kl": 0.00017118453979492188, | |
| "learning_rate": 2.6666666666666667e-07, | |
| "loss": 0.0, | |
| "reward": 0.36277779191732407, | |
| "reward_std": 0.5977305769920349, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 0.3627777770161629, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 463.37501525878906, | |
| "epoch": 0.0020391517128874386, | |
| "grad_norm": 2.123991976170542, | |
| "kl": 0.00017118453979492188, | |
| "learning_rate": 3.333333333333333e-07, | |
| "loss": 0.0, | |
| "reward": 0.29836806654930115, | |
| "reward_std": 0.6623781323432922, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 0.25670139491558075, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 421.93751525878906, | |
| "epoch": 0.0024469820554649264, | |
| "grad_norm": 2.878424655232733, | |
| "kl": 0.00017547607421875, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0, | |
| "reward": 0.3968055695295334, | |
| "reward_std": 0.63937908411026, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 0.3968055695295334, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 475.3541717529297, | |
| "epoch": 0.0028548123980424145, | |
| "grad_norm": 2.4896630378314084, | |
| "kl": 0.00018596649169921875, | |
| "learning_rate": 4.6666666666666666e-07, | |
| "loss": 0.0, | |
| "reward": 0.24850694835186005, | |
| "reward_std": 0.5338329374790192, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 0.24850695580244064, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 416.6666717529297, | |
| "epoch": 0.0032626427406199023, | |
| "grad_norm": 2.09352394131462, | |
| "kl": 0.0005130767822265625, | |
| "learning_rate": 5.333333333333333e-07, | |
| "loss": 0.0, | |
| "reward": 0.33868058025836945, | |
| "reward_std": 0.6486604511737823, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 0.29701392352581024, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 415.00001525878906, | |
| "epoch": 0.00367047308319739, | |
| "grad_norm": 2.0112836386100708, | |
| "kl": 0.0008068084716796875, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0, | |
| "reward": 0.25767362117767334, | |
| "reward_std": 0.5489525943994522, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 0.21600694954395294, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 297.3958435058594, | |
| "epoch": 0.004078303425774877, | |
| "grad_norm": 2.8663136385547197, | |
| "kl": 0.001346588134765625, | |
| "learning_rate": 6.666666666666666e-07, | |
| "loss": 0.0, | |
| "reward": 0.5803819894790649, | |
| "reward_std": 0.7896733283996582, | |
| "rewards/equation_reward_func": 0.1250000037252903, | |
| "rewards/format_reward_func": 0.45538195967674255, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 399.54168701171875, | |
| "epoch": 0.004486133768352365, | |
| "grad_norm": 2.697624014237005, | |
| "kl": 0.00267791748046875, | |
| "learning_rate": 7.333333333333332e-07, | |
| "loss": 0.0, | |
| "reward": 0.4483680725097656, | |
| "reward_std": 0.5992304682731628, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 0.44836805760860443, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 284.75001525878906, | |
| "epoch": 0.004893964110929853, | |
| "grad_norm": 2.5548177263979235, | |
| "kl": 0.0041961669921875, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0, | |
| "reward": 0.5757291615009308, | |
| "reward_std": 0.647213488817215, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 0.5757291913032532, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 383.0833435058594, | |
| "epoch": 0.005301794453507341, | |
| "grad_norm": 2.5987870166821843, | |
| "kl": 0.00470733642578125, | |
| "learning_rate": 8.666666666666667e-07, | |
| "loss": 0.0, | |
| "reward": 0.552534744143486, | |
| "reward_std": 0.7489242553710938, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 0.5108680874109268, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 310.5, | |
| "epoch": 0.005709624796084829, | |
| "grad_norm": 2.3713758660284148, | |
| "kl": 0.006378173828125, | |
| "learning_rate": 9.333333333333333e-07, | |
| "loss": 0.0, | |
| "reward": 0.8682639002799988, | |
| "reward_std": 0.9391801357269287, | |
| "rewards/equation_reward_func": 0.2083333432674408, | |
| "rewards/format_reward_func": 0.6599305868148804, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 259.1666717529297, | |
| "epoch": 0.006117455138662317, | |
| "grad_norm": 1.9481870816169569, | |
| "kl": 0.0147705078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 0.7582292258739471, | |
| "reward_std": 0.7113883793354034, | |
| "rewards/equation_reward_func": 0.0833333358168602, | |
| "rewards/format_reward_func": 0.674895852804184, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 294.66668701171875, | |
| "epoch": 0.0065252854812398045, | |
| "grad_norm": 2.8753021397818284, | |
| "kl": 0.010467529296875, | |
| "learning_rate": 9.999696229471714e-07, | |
| "loss": 0.0, | |
| "reward": 0.931770920753479, | |
| "reward_std": 0.7402721643447876, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 0.890104204416275, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 190.70834350585938, | |
| "epoch": 0.006933115823817292, | |
| "grad_norm": 3.149911745984124, | |
| "kl": 0.0218505859375, | |
| "learning_rate": 9.998784954797472e-07, | |
| "loss": 0.0, | |
| "reward": 1.0088889300823212, | |
| "reward_std": 0.7540942430496216, | |
| "rewards/equation_reward_func": 0.0833333358168602, | |
| "rewards/format_reward_func": 0.9255555272102356, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 246.4166717529297, | |
| "epoch": 0.00734094616639478, | |
| "grad_norm": 3.153714167577427, | |
| "kl": 0.01788330078125, | |
| "learning_rate": 9.99726628670463e-07, | |
| "loss": 0.0, | |
| "reward": 1.202048659324646, | |
| "reward_std": 1.045266568660736, | |
| "rewards/equation_reward_func": 0.2500000074505806, | |
| "rewards/format_reward_func": 0.9520486295223236, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 275.31251525878906, | |
| "epoch": 0.007748776508972268, | |
| "grad_norm": 0.9481466466680466, | |
| "kl": 0.013427734375, | |
| "learning_rate": 9.995140409723828e-07, | |
| "loss": 0.0, | |
| "reward": 1.1351736187934875, | |
| "reward_std": 0.5596802830696106, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.1351736187934875, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 188.5416717529297, | |
| "epoch": 0.008156606851549755, | |
| "grad_norm": 1.4742456865490186, | |
| "kl": 0.02484130859375, | |
| "learning_rate": 9.99240758216658e-07, | |
| "loss": 0.0, | |
| "reward": 1.0687847137451172, | |
| "reward_std": 0.6074499785900116, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 1.0271181166172028, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 199.70833587646484, | |
| "epoch": 0.008564437194127243, | |
| "grad_norm": 1.915063750564903, | |
| "kl": 0.032958984375, | |
| "learning_rate": 9.989068136093872e-07, | |
| "loss": 0.0, | |
| "reward": 1.1395833194255829, | |
| "reward_std": 0.5386238098144531, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 1.0979166626930237, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 267.8333435058594, | |
| "epoch": 0.00897226753670473, | |
| "grad_norm": 1.3838672018586455, | |
| "kl": 0.02752685546875, | |
| "learning_rate": 9.985122477275824e-07, | |
| "loss": 0.0, | |
| "reward": 1.1198958158493042, | |
| "reward_std": 0.4947269856929779, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.119895875453949, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 198.3541717529297, | |
| "epoch": 0.009380097879282219, | |
| "grad_norm": 2.65001960202523, | |
| "kl": 0.0621337890625, | |
| "learning_rate": 9.98057108514238e-07, | |
| "loss": 0.0001, | |
| "reward": 1.3591667413711548, | |
| "reward_std": 0.5990243405103683, | |
| "rewards/equation_reward_func": 0.1666666716337204, | |
| "rewards/format_reward_func": 1.1924999952316284, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 180.14583587646484, | |
| "epoch": 0.009787928221859706, | |
| "grad_norm": 2.273310138239933, | |
| "kl": 0.02679443359375, | |
| "learning_rate": 9.975414512725056e-07, | |
| "loss": 0.0, | |
| "reward": 1.4479514360427856, | |
| "reward_std": 0.5268709659576416, | |
| "rewards/equation_reward_func": 0.1666666716337204, | |
| "rewards/format_reward_func": 1.281284749507904, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 297.7291793823242, | |
| "epoch": 0.010195758564437194, | |
| "grad_norm": 1.5712495679827763, | |
| "kl": 0.02593994140625, | |
| "learning_rate": 9.969653386589747e-07, | |
| "loss": 0.0, | |
| "reward": 1.27177095413208, | |
| "reward_std": 0.6252816617488861, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 1.2301042079925537, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 194.5416717529297, | |
| "epoch": 0.010603588907014683, | |
| "grad_norm": 2.201493729261743, | |
| "kl": 0.03179931640625, | |
| "learning_rate": 9.963288406760582e-07, | |
| "loss": 0.0, | |
| "reward": 1.5546875, | |
| "reward_std": 0.7862544655799866, | |
| "rewards/equation_reward_func": 0.25, | |
| "rewards/format_reward_func": 1.3046875596046448, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 294.8958435058594, | |
| "epoch": 0.01101141924959217, | |
| "grad_norm": 1.093662160894823, | |
| "kl": 0.0328369140625, | |
| "learning_rate": 9.956320346634875e-07, | |
| "loss": 0.0, | |
| "reward": 1.2887500524520874, | |
| "reward_std": 0.49396970868110657, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.2887500524520874, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 300.1458435058594, | |
| "epoch": 0.011419249592169658, | |
| "grad_norm": 0.8578612513997826, | |
| "kl": 0.03271484375, | |
| "learning_rate": 9.94875005288915e-07, | |
| "loss": 0.0, | |
| "reward": 1.1602779626846313, | |
| "reward_std": 0.5292092859745026, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.1602778434753418, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 299.7291717529297, | |
| "epoch": 0.011827079934747145, | |
| "grad_norm": 1.6716454246814316, | |
| "kl": 0.032470703125, | |
| "learning_rate": 9.940578445376257e-07, | |
| "loss": 0.0, | |
| "reward": 1.3559028506278992, | |
| "reward_std": 0.7161896526813507, | |
| "rewards/equation_reward_func": 0.1250000037252903, | |
| "rewards/format_reward_func": 1.2309028506278992, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 366.50001525878906, | |
| "epoch": 0.012234910277324634, | |
| "grad_norm": 1.2542596393593248, | |
| "kl": 0.026611328125, | |
| "learning_rate": 9.931806517013612e-07, | |
| "loss": 0.0, | |
| "reward": 1.2951388955116272, | |
| "reward_std": 0.6559239327907562, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.2951388955116272, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 267.37501525878906, | |
| "epoch": 0.01264274061990212, | |
| "grad_norm": 1.951739626954709, | |
| "kl": 0.03289794921875, | |
| "learning_rate": 9.922435333662535e-07, | |
| "loss": 0.0, | |
| "reward": 1.624826431274414, | |
| "reward_std": 0.869500607252121, | |
| "rewards/equation_reward_func": 0.2916666716337204, | |
| "rewards/format_reward_func": 1.3331597447395325, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 252.1041717529297, | |
| "epoch": 0.013050570962479609, | |
| "grad_norm": 1.7631361729865684, | |
| "kl": 0.0838623046875, | |
| "learning_rate": 9.912466033998757e-07, | |
| "loss": 0.0001, | |
| "reward": 1.5268749594688416, | |
| "reward_std": 0.7954416573047638, | |
| "rewards/equation_reward_func": 0.2916666679084301, | |
| "rewards/format_reward_func": 1.2352083921432495, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 266.2291717529297, | |
| "epoch": 0.013458401305057096, | |
| "grad_norm": 1.086442325424048, | |
| "kl": 0.0350341796875, | |
| "learning_rate": 9.901899829374047e-07, | |
| "loss": 0.0, | |
| "reward": 1.3218055367469788, | |
| "reward_std": 0.43308839201927185, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.3218055367469788, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 270.62501525878906, | |
| "epoch": 0.013866231647634585, | |
| "grad_norm": 0.7757080705779537, | |
| "kl": 0.03179931640625, | |
| "learning_rate": 9.890738003669027e-07, | |
| "loss": 0.0, | |
| "reward": 1.440381944179535, | |
| "reward_std": 0.36590851843357086, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.440381944179535, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 361.7291717529297, | |
| "epoch": 0.014274061990212071, | |
| "grad_norm": 0.9196489537714668, | |
| "kl": 0.105224609375, | |
| "learning_rate": 9.878981913137177e-07, | |
| "loss": 0.0001, | |
| "reward": 1.234375, | |
| "reward_std": 0.5996429324150085, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.2343750596046448, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 324.6041717529297, | |
| "epoch": 0.01468189233278956, | |
| "grad_norm": 1.3537086792376973, | |
| "kl": 0.037841796875, | |
| "learning_rate": 9.866632986240029e-07, | |
| "loss": 0.0, | |
| "reward": 1.4478819966316223, | |
| "reward_std": 0.447622686624527, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.4478819966316223, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 288.00001525878906, | |
| "epoch": 0.015089722675367047, | |
| "grad_norm": 0.9416399145929761, | |
| "kl": 0.0350341796875, | |
| "learning_rate": 9.853692723473598e-07, | |
| "loss": 0.0, | |
| "reward": 1.6369444727897644, | |
| "reward_std": 0.5435648560523987, | |
| "rewards/equation_reward_func": 0.125, | |
| "rewards/format_reward_func": 1.5119444727897644, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 324.8958435058594, | |
| "epoch": 0.015497553017944535, | |
| "grad_norm": 1.4608277452795702, | |
| "kl": 0.0404052734375, | |
| "learning_rate": 9.840162697186074e-07, | |
| "loss": 0.0, | |
| "reward": 1.5702083706855774, | |
| "reward_std": 0.8935641050338745, | |
| "rewards/equation_reward_func": 0.3333333432674408, | |
| "rewards/format_reward_func": 1.236875057220459, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 290.56250762939453, | |
| "epoch": 0.015905383360522024, | |
| "grad_norm": 0.7407367075365319, | |
| "kl": 0.0458984375, | |
| "learning_rate": 9.826044551386742e-07, | |
| "loss": 0.0, | |
| "reward": 1.5788541436195374, | |
| "reward_std": 0.5843808948993683, | |
| "rewards/equation_reward_func": 0.125, | |
| "rewards/format_reward_func": 1.4538542032241821, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 362.68751525878906, | |
| "epoch": 0.01631321370309951, | |
| "grad_norm": 1.5661374163452022, | |
| "kl": 0.0408935546875, | |
| "learning_rate": 9.811340001546251e-07, | |
| "loss": 0.0, | |
| "reward": 1.6298264265060425, | |
| "reward_std": 0.8494586944580078, | |
| "rewards/equation_reward_func": 0.25, | |
| "rewards/format_reward_func": 1.3798264265060425, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 451.125, | |
| "epoch": 0.016721044045676998, | |
| "grad_norm": 1.0244455960613406, | |
| "kl": 0.0401611328125, | |
| "learning_rate": 9.79605083438815e-07, | |
| "loss": 0.0, | |
| "reward": 1.2677431106567383, | |
| "reward_std": 0.6154287457466125, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.2677430510520935, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 308.6458435058594, | |
| "epoch": 0.017128874388254486, | |
| "grad_norm": 1.3216564030922435, | |
| "kl": 0.0445556640625, | |
| "learning_rate": 9.780178907671788e-07, | |
| "loss": 0.0, | |
| "reward": 1.554166853427887, | |
| "reward_std": 0.5582673996686935, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 1.5125000476837158, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 374.60418701171875, | |
| "epoch": 0.017536704730831975, | |
| "grad_norm": 1.4468753547015691, | |
| "kl": 0.0377197265625, | |
| "learning_rate": 9.763726149966595e-07, | |
| "loss": 0.0, | |
| "reward": 1.77156263589859, | |
| "reward_std": 0.9427327811717987, | |
| "rewards/equation_reward_func": 0.3750000111758709, | |
| "rewards/format_reward_func": 1.3965625166893005, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 404.06251525878906, | |
| "epoch": 0.01794453507340946, | |
| "grad_norm": 1.3417802102005347, | |
| "kl": 0.0430908203125, | |
| "learning_rate": 9.74669456041773e-07, | |
| "loss": 0.0, | |
| "reward": 1.4073264598846436, | |
| "reward_std": 0.6282331496477127, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 1.365659773349762, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 442.625, | |
| "epoch": 0.01835236541598695, | |
| "grad_norm": 1.5147263647387277, | |
| "kl": 0.0496826171875, | |
| "learning_rate": 9.729086208503173e-07, | |
| "loss": 0.0, | |
| "reward": 1.4621528387069702, | |
| "reward_std": 0.8307860195636749, | |
| "rewards/equation_reward_func": 0.125, | |
| "rewards/format_reward_func": 1.3371528387069702, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 333.3541793823242, | |
| "epoch": 0.018760195758564437, | |
| "grad_norm": 1.754991435054013, | |
| "kl": 0.049560546875, | |
| "learning_rate": 9.710903233782272e-07, | |
| "loss": 0.0, | |
| "reward": 1.9437847137451172, | |
| "reward_std": 0.8394620716571808, | |
| "rewards/equation_reward_func": 0.5, | |
| "rewards/format_reward_func": 1.4437847137451172, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 381.4583435058594, | |
| "epoch": 0.019168026101141926, | |
| "grad_norm": 0.8810621605534877, | |
| "kl": 0.048828125, | |
| "learning_rate": 9.69214784563576e-07, | |
| "loss": 0.0, | |
| "reward": 1.7743055820465088, | |
| "reward_std": 0.7846577763557434, | |
| "rewards/equation_reward_func": 0.25, | |
| "rewards/format_reward_func": 1.5243056416511536, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 413.3958435058594, | |
| "epoch": 0.01957585644371941, | |
| "grad_norm": 1.1981506775594148, | |
| "kl": 0.0509033203125, | |
| "learning_rate": 9.672822322997304e-07, | |
| "loss": 0.0001, | |
| "reward": 1.5128472447395325, | |
| "reward_std": 0.724719375371933, | |
| "rewards/equation_reward_func": 0.0833333358168602, | |
| "rewards/format_reward_func": 1.429513931274414, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 408.68751525878906, | |
| "epoch": 0.0199836867862969, | |
| "grad_norm": 0.8127416399915954, | |
| "kl": 0.0482177734375, | |
| "learning_rate": 9.652929014076592e-07, | |
| "loss": 0.0, | |
| "reward": 1.4740972518920898, | |
| "reward_std": 0.581254854798317, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.4740972518920898, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 401.125, | |
| "epoch": 0.020391517128874388, | |
| "grad_norm": 0.9832509648546535, | |
| "kl": 0.04248046875, | |
| "learning_rate": 9.632470336074007e-07, | |
| "loss": 0.0, | |
| "reward": 1.5040277242660522, | |
| "reward_std": 0.6817552745342255, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 1.4623610973358154, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 434.9583435058594, | |
| "epoch": 0.020799347471451877, | |
| "grad_norm": 1.430611768368398, | |
| "kl": 0.0458984375, | |
| "learning_rate": 9.611448774886923e-07, | |
| "loss": 0.0, | |
| "reward": 1.833784818649292, | |
| "reward_std": 0.832920491695404, | |
| "rewards/equation_reward_func": 0.25, | |
| "rewards/format_reward_func": 1.5837848782539368, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 368.8958435058594, | |
| "epoch": 0.021207177814029365, | |
| "grad_norm": 1.681849348882777, | |
| "kl": 0.048828125, | |
| "learning_rate": 9.589866884807634e-07, | |
| "loss": 0.0, | |
| "reward": 2.02239590883255, | |
| "reward_std": 1.069144368171692, | |
| "rewards/equation_reward_func": 0.5416666865348816, | |
| "rewards/format_reward_func": 1.4807292222976685, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 417.60418701171875, | |
| "epoch": 0.02161500815660685, | |
| "grad_norm": 1.5460934994217617, | |
| "kl": 0.0577392578125, | |
| "learning_rate": 9.567727288213004e-07, | |
| "loss": 0.0001, | |
| "reward": 1.7238194942474365, | |
| "reward_std": 0.8790097832679749, | |
| "rewards/equation_reward_func": 0.2916666865348816, | |
| "rewards/format_reward_func": 1.432152807712555, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 367.2916717529297, | |
| "epoch": 0.02202283849918434, | |
| "grad_norm": 0.8154580597171138, | |
| "kl": 0.079833984375, | |
| "learning_rate": 9.545032675245813e-07, | |
| "loss": 0.0001, | |
| "reward": 1.6206597089767456, | |
| "reward_std": 0.5355260521173477, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 1.578993022441864, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 378.4375, | |
| "epoch": 0.022430668841761828, | |
| "grad_norm": 0.8080191505191605, | |
| "kl": 2.8504638671875, | |
| "learning_rate": 9.521785803487888e-07, | |
| "loss": 0.0029, | |
| "reward": 1.6229513883590698, | |
| "reward_std": 0.5469937920570374, | |
| "rewards/equation_reward_func": 0.0833333358168602, | |
| "rewards/format_reward_func": 1.5396180748939514, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 353.7083435058594, | |
| "epoch": 0.022838499184339316, | |
| "grad_norm": 2.0675711092750553, | |
| "kl": 0.0582275390625, | |
| "learning_rate": 9.497989497625034e-07, | |
| "loss": 0.0001, | |
| "reward": 1.9050694108009338, | |
| "reward_std": 0.8356568217277527, | |
| "rewards/equation_reward_func": 0.3750000111758709, | |
| "rewards/format_reward_func": 1.5300694704055786, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 396.3958435058594, | |
| "epoch": 0.0232463295269168, | |
| "grad_norm": 1.7722424779735066, | |
| "kl": 0.0474853515625, | |
| "learning_rate": 9.473646649103817e-07, | |
| "loss": 0.0, | |
| "reward": 1.8715277910232544, | |
| "reward_std": 0.7635601460933685, | |
| "rewards/equation_reward_func": 0.2916666865348816, | |
| "rewards/format_reward_func": 1.579861044883728, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 392.25001525878906, | |
| "epoch": 0.02365415986949429, | |
| "grad_norm": 1.4500449746136268, | |
| "kl": 0.052490234375, | |
| "learning_rate": 9.448760215780216e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8360764980316162, | |
| "reward_std": 0.8432624340057373, | |
| "rewards/equation_reward_func": 0.3333333432674408, | |
| "rewards/format_reward_func": 1.502743124961853, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 445.37501525878906, | |
| "epoch": 0.02406199021207178, | |
| "grad_norm": 1.0245921035180448, | |
| "kl": 0.046142578125, | |
| "learning_rate": 9.423333221560229e-07, | |
| "loss": 0.0, | |
| "reward": 1.8455902934074402, | |
| "reward_std": 0.6104674339294434, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 1.8039236664772034, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 350.00001525878906, | |
| "epoch": 0.024469820554649267, | |
| "grad_norm": 1.0559535296932554, | |
| "kl": 0.0653076171875, | |
| "learning_rate": 9.397368756032444e-07, | |
| "loss": 0.0001, | |
| "reward": 2.0223612189292908, | |
| "reward_std": 0.8122723698616028, | |
| "rewards/equation_reward_func": 0.4166666679084301, | |
| "rewards/format_reward_func": 1.6056944131851196, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 430.6041717529297, | |
| "epoch": 0.024877650897226752, | |
| "grad_norm": 1.2128128464325674, | |
| "kl": 0.052978515625, | |
| "learning_rate": 9.370869974092628e-07, | |
| "loss": 0.0001, | |
| "reward": 1.6623265147209167, | |
| "reward_std": 0.6956472098827362, | |
| "rewards/equation_reward_func": 0.0833333358168602, | |
| "rewards/format_reward_func": 1.5789931416511536, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 430.62501525878906, | |
| "epoch": 0.02528548123980424, | |
| "grad_norm": 1.3439878279377437, | |
| "kl": 0.0567626953125, | |
| "learning_rate": 9.343840095560371e-07, | |
| "loss": 0.0001, | |
| "reward": 1.6742013692855835, | |
| "reward_std": 0.8052680194377899, | |
| "rewards/equation_reward_func": 0.1666666716337204, | |
| "rewards/format_reward_func": 1.5075347423553467, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 450.1666717529297, | |
| "epoch": 0.02569331158238173, | |
| "grad_norm": 1.6330050200150412, | |
| "kl": 0.05224609375, | |
| "learning_rate": 9.316282404787869e-07, | |
| "loss": 0.0001, | |
| "reward": 1.7786458730697632, | |
| "reward_std": 0.5853727161884308, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 1.7369791865348816, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 489.02085876464844, | |
| "epoch": 0.026101141924959218, | |
| "grad_norm": 0.7871177276009865, | |
| "kl": 0.051025390625, | |
| "learning_rate": 9.288200250260834e-07, | |
| "loss": 0.0001, | |
| "reward": 1.7109723091125488, | |
| "reward_std": 0.6242659687995911, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.7109723091125488, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 386.79168701171875, | |
| "epoch": 0.026508972267536703, | |
| "grad_norm": 1.4313328997618522, | |
| "kl": 0.07373046875, | |
| "learning_rate": 9.259597044191635e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8375002145767212, | |
| "reward_std": 0.8174974322319031, | |
| "rewards/equation_reward_func": 0.2916666865348816, | |
| "rewards/format_reward_func": 1.5458334684371948, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 444.9791717529297, | |
| "epoch": 0.026916802610114192, | |
| "grad_norm": 0.7975312553477618, | |
| "kl": 0.058837890625, | |
| "learning_rate": 9.230476262104676e-07, | |
| "loss": 0.0001, | |
| "reward": 1.977222204208374, | |
| "reward_std": 0.6751963198184967, | |
| "rewards/equation_reward_func": 0.1666666716337204, | |
| "rewards/format_reward_func": 1.8105555772781372, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 399.4791717529297, | |
| "epoch": 0.02732463295269168, | |
| "grad_norm": 1.57086996187115, | |
| "kl": 0.071044921875, | |
| "learning_rate": 9.200841442414105e-07, | |
| "loss": 0.0001, | |
| "reward": 1.7256250977516174, | |
| "reward_std": 0.631768524646759, | |
| "rewards/equation_reward_func": 0.0833333358168602, | |
| "rewards/format_reward_func": 1.6422916650772095, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 497.08335876464844, | |
| "epoch": 0.02773246329526917, | |
| "grad_norm": 0.8207193125582591, | |
| "kl": 0.078857421875, | |
| "learning_rate": 9.17069618599385e-07, | |
| "loss": 0.0001, | |
| "reward": 1.752673625946045, | |
| "reward_std": 0.6211968958377838, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.7526736855506897, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 456.29168701171875, | |
| "epoch": 0.028140293637846654, | |
| "grad_norm": 0.9050345890235396, | |
| "kl": 0.0628662109375, | |
| "learning_rate": 9.1400441557401e-07, | |
| "loss": 0.0001, | |
| "reward": 1.7863542437553406, | |
| "reward_std": 0.5832376182079315, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 1.7446874976158142, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 581.5416870117188, | |
| "epoch": 0.028548123980424143, | |
| "grad_norm": 0.8497382573737793, | |
| "kl": 0.060791015625, | |
| "learning_rate": 9.108889076126225e-07, | |
| "loss": 0.0001, | |
| "reward": 1.7704166769981384, | |
| "reward_std": 0.7596422731876373, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.7704167366027832, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 409.06251525878906, | |
| "epoch": 0.02895595432300163, | |
| "grad_norm": 1.4428479278815958, | |
| "kl": 0.0732421875, | |
| "learning_rate": 9.077234732750223e-07, | |
| "loss": 0.0001, | |
| "reward": 2.054965376853943, | |
| "reward_std": 0.8280318379402161, | |
| "rewards/equation_reward_func": 0.4166666865348816, | |
| "rewards/format_reward_func": 1.6382986307144165, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 399.1875, | |
| "epoch": 0.02936378466557912, | |
| "grad_norm": 0.850509849129193, | |
| "kl": 0.068115234375, | |
| "learning_rate": 9.045084971874737e-07, | |
| "loss": 0.0001, | |
| "reward": 2.358993172645569, | |
| "reward_std": 0.7760606110095978, | |
| "rewards/equation_reward_func": 0.625, | |
| "rewards/format_reward_func": 1.7339931726455688, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 384.2708435058594, | |
| "epoch": 0.029771615008156605, | |
| "grad_norm": 1.3099408456816737, | |
| "kl": 0.10888671875, | |
| "learning_rate": 9.012443699959704e-07, | |
| "loss": 0.0001, | |
| "reward": 2.7191320657730103, | |
| "reward_std": 0.9904708862304688, | |
| "rewards/equation_reward_func": 1.0, | |
| "rewards/format_reward_func": 1.7191320657730103, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 464.12501525878906, | |
| "epoch": 0.030179445350734094, | |
| "grad_norm": 0.9088601877007455, | |
| "kl": 0.08056640625, | |
| "learning_rate": 8.979314883187692e-07, | |
| "loss": 0.0001, | |
| "reward": 2.0606598258018494, | |
| "reward_std": 0.852357029914856, | |
| "rewards/equation_reward_func": 0.3333333432674408, | |
| "rewards/format_reward_func": 1.7273263931274414, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 566.8750305175781, | |
| "epoch": 0.030587275693311582, | |
| "grad_norm": 0.8778862901380006, | |
| "kl": 0.067626953125, | |
| "learning_rate": 8.945702546981968e-07, | |
| "loss": 0.0001, | |
| "reward": 1.779270887374878, | |
| "reward_std": 0.693590372800827, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.7792708277702332, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 429.3333435058594, | |
| "epoch": 0.03099510603588907, | |
| "grad_norm": 0.9577088802916648, | |
| "kl": 0.096923828125, | |
| "learning_rate": 8.911610775517382e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8277431726455688, | |
| "reward_std": 0.6039248108863831, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 1.7860764265060425, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 565.4791870117188, | |
| "epoch": 0.031402936378466556, | |
| "grad_norm": 0.8371289059796367, | |
| "kl": 0.08740234375, | |
| "learning_rate": 8.877043711224107e-07, | |
| "loss": 0.0001, | |
| "reward": 1.9469445943832397, | |
| "reward_std": 0.5404301732778549, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.946944534778595, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 541.1666717529297, | |
| "epoch": 0.03181076672104405, | |
| "grad_norm": 0.7969331394521091, | |
| "kl": 0.079833984375, | |
| "learning_rate": 8.842005554284295e-07, | |
| "loss": 0.0001, | |
| "reward": 2.1353471875190735, | |
| "reward_std": 0.8812746703624725, | |
| "rewards/equation_reward_func": 0.2916666865348816, | |
| "rewards/format_reward_func": 1.8436806201934814, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 464.37501525878906, | |
| "epoch": 0.03221859706362153, | |
| "grad_norm": 1.351816458116067, | |
| "kl": 0.078857421875, | |
| "learning_rate": 8.806500562121722e-07, | |
| "loss": 0.0001, | |
| "reward": 2.5637847781181335, | |
| "reward_std": 0.8339135944843292, | |
| "rewards/equation_reward_func": 0.7083333544433117, | |
| "rewards/format_reward_func": 1.8554513454437256, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 629.9583435058594, | |
| "epoch": 0.03262642740619902, | |
| "grad_norm": 0.9040821275296973, | |
| "kl": 0.06396484375, | |
| "learning_rate": 8.77053304888448e-07, | |
| "loss": 0.0001, | |
| "reward": 1.7654513716697693, | |
| "reward_std": 0.9269569218158722, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 1.7237846851348877, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 637.7500305175781, | |
| "epoch": 0.03303425774877651, | |
| "grad_norm": 0.8146639791448892, | |
| "kl": 0.065673828125, | |
| "learning_rate": 8.734107384920769e-07, | |
| "loss": 0.0001, | |
| "reward": 1.7398958802223206, | |
| "reward_std": 0.8910411596298218, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.7398958802223206, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 553.3541870117188, | |
| "epoch": 0.033442088091353996, | |
| "grad_norm": 1.0794685996564912, | |
| "kl": 0.08251953125, | |
| "learning_rate": 8.69722799624786e-07, | |
| "loss": 0.0001, | |
| "reward": 2.068472385406494, | |
| "reward_std": 0.7307632863521576, | |
| "rewards/equation_reward_func": 0.125, | |
| "rewards/format_reward_func": 1.943472445011139, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 510.75, | |
| "epoch": 0.03384991843393149, | |
| "grad_norm": 1.5254552695681238, | |
| "kl": 0.084716796875, | |
| "learning_rate": 8.659899364014308e-07, | |
| "loss": 0.0001, | |
| "reward": 2.1311458945274353, | |
| "reward_std": 0.8883395195007324, | |
| "rewards/equation_reward_func": 0.3333333432674408, | |
| "rewards/format_reward_func": 1.7978126406669617, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 581.5416870117188, | |
| "epoch": 0.03425774877650897, | |
| "grad_norm": 0.9271897233542844, | |
| "kl": 0.07763671875, | |
| "learning_rate": 8.622126023955445e-07, | |
| "loss": 0.0001, | |
| "reward": 1.963923692703247, | |
| "reward_std": 0.6030838936567307, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.9639238119125366, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 555.3125152587891, | |
| "epoch": 0.03466557911908646, | |
| "grad_norm": 1.4561466805793857, | |
| "kl": 0.070556640625, | |
| "learning_rate": 8.583912565842256e-07, | |
| "loss": 0.0001, | |
| "reward": 2.237326502799988, | |
| "reward_std": 0.8882516920566559, | |
| "rewards/equation_reward_func": 0.5, | |
| "rewards/format_reward_func": 1.7373263835906982, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 421.2708435058594, | |
| "epoch": 0.03507340946166395, | |
| "grad_norm": 1.3044883746742693, | |
| "kl": 0.10107421875, | |
| "learning_rate": 8.545263632923686e-07, | |
| "loss": 0.0001, | |
| "reward": 2.6816667318344116, | |
| "reward_std": 1.1341252326965332, | |
| "rewards/equation_reward_func": 1.041666716337204, | |
| "rewards/format_reward_func": 1.64000004529953, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 592.6458435058594, | |
| "epoch": 0.035481239804241435, | |
| "grad_norm": 0.9961988228794976, | |
| "kl": 0.066162109375, | |
| "learning_rate": 8.506183921362442e-07, | |
| "loss": 0.0001, | |
| "reward": 2.093229293823242, | |
| "reward_std": 0.8178855180740356, | |
| "rewards/equation_reward_func": 0.1666666716337204, | |
| "rewards/format_reward_func": 1.9265625476837158, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 332.0208435058594, | |
| "epoch": 0.03588907014681892, | |
| "grad_norm": 1.5901752280491694, | |
| "kl": 0.099609375, | |
| "learning_rate": 8.466678179664377e-07, | |
| "loss": 0.0001, | |
| "reward": 2.9652082920074463, | |
| "reward_std": 0.8383155167102814, | |
| "rewards/equation_reward_func": 1.2500000596046448, | |
| "rewards/format_reward_func": 1.715208351612091, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 460.6666717529297, | |
| "epoch": 0.03629690048939641, | |
| "grad_norm": 1.6000004098847773, | |
| "kl": 0.16796875, | |
| "learning_rate": 8.426751208101499e-07, | |
| "loss": 0.0002, | |
| "reward": 2.6753125190734863, | |
| "reward_std": 1.0194191336631775, | |
| "rewards/equation_reward_func": 0.916666716337204, | |
| "rewards/format_reward_func": 1.7586458325386047, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 477.3125, | |
| "epoch": 0.0367047308319739, | |
| "grad_norm": 1.2239460420462749, | |
| "kl": 0.088623046875, | |
| "learning_rate": 8.386407858128706e-07, | |
| "loss": 0.0001, | |
| "reward": 2.590486168861389, | |
| "reward_std": 0.9470961093902588, | |
| "rewards/equation_reward_func": 0.75, | |
| "rewards/format_reward_func": 1.8404861688613892, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 444.5416717529297, | |
| "epoch": 0.03711256117455139, | |
| "grad_norm": 1.230296809506791, | |
| "kl": 0.10107421875, | |
| "learning_rate": 8.34565303179429e-07, | |
| "loss": 0.0001, | |
| "reward": 2.523506999015808, | |
| "reward_std": 0.9190675318241119, | |
| "rewards/equation_reward_func": 0.6666666865348816, | |
| "rewards/format_reward_func": 1.8568402528762817, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 556.7916870117188, | |
| "epoch": 0.037520391517128875, | |
| "grad_norm": 0.9132808347888518, | |
| "kl": 0.083984375, | |
| "learning_rate": 8.304491681144305e-07, | |
| "loss": 0.0001, | |
| "reward": 1.9321181178092957, | |
| "reward_std": 0.6165703535079956, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.9321181774139404, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 329.1666717529297, | |
| "epoch": 0.03792822185970636, | |
| "grad_norm": 1.5619424825885055, | |
| "kl": 0.118408203125, | |
| "learning_rate": 8.262928807620843e-07, | |
| "loss": 0.0001, | |
| "reward": 3.2064584493637085, | |
| "reward_std": 0.8171246647834778, | |
| "rewards/equation_reward_func": 1.5000000596046448, | |
| "rewards/format_reward_func": 1.706458330154419, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 388.1041717529297, | |
| "epoch": 0.03833605220228385, | |
| "grad_norm": 1.5560017569200078, | |
| "kl": 0.115478515625, | |
| "learning_rate": 8.220969461454321e-07, | |
| "loss": 0.0001, | |
| "reward": 2.5811806321144104, | |
| "reward_std": 0.5943560600280762, | |
| "rewards/equation_reward_func": 0.9166666865348816, | |
| "rewards/format_reward_func": 1.6645139455795288, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 502.33335876464844, | |
| "epoch": 0.03874388254486134, | |
| "grad_norm": 1.3187487188473963, | |
| "kl": 0.103515625, | |
| "learning_rate": 8.178618741049841e-07, | |
| "loss": 0.0001, | |
| "reward": 2.311215341091156, | |
| "reward_std": 0.7084816992282867, | |
| "rewards/equation_reward_func": 0.375, | |
| "rewards/format_reward_func": 1.936215341091156, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 347.29168701171875, | |
| "epoch": 0.03915171288743882, | |
| "grad_norm": 1.0778244344859724, | |
| "kl": 0.1083984375, | |
| "learning_rate": 8.135881792367685e-07, | |
| "loss": 0.0001, | |
| "reward": 2.41055566072464, | |
| "reward_std": 0.7850378751754761, | |
| "rewards/equation_reward_func": 0.7500000409781933, | |
| "rewards/format_reward_func": 1.66055566072464, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 335.9791717529297, | |
| "epoch": 0.039559543230016314, | |
| "grad_norm": 1.522034238839679, | |
| "kl": 0.105712890625, | |
| "learning_rate": 8.092763808298046e-07, | |
| "loss": 0.0001, | |
| "reward": 2.9850348234176636, | |
| "reward_std": 0.9680465757846832, | |
| "rewards/equation_reward_func": 1.375, | |
| "rewards/format_reward_func": 1.610034704208374, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 586.5625305175781, | |
| "epoch": 0.0399673735725938, | |
| "grad_norm": 0.6708954041029114, | |
| "kl": 0.10693359375, | |
| "learning_rate": 8.049270028030045e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8110415935516357, | |
| "reward_std": 0.7432913780212402, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.8110417127609253, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 492.7916717529297, | |
| "epoch": 0.04037520391517129, | |
| "grad_norm": 1.1370661223441034, | |
| "kl": 0.087646484375, | |
| "learning_rate": 8.005405736415125e-07, | |
| "loss": 0.0001, | |
| "reward": 2.2512154579162598, | |
| "reward_std": 0.7727322578430176, | |
| "rewards/equation_reward_func": 0.2916666716337204, | |
| "rewards/format_reward_func": 1.9595486521720886, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 618.7708740234375, | |
| "epoch": 0.040783034257748776, | |
| "grad_norm": 0.9472446872281457, | |
| "kl": 0.081298828125, | |
| "learning_rate": 7.961176263324901e-07, | |
| "loss": 0.0001, | |
| "reward": 2.0697221755981445, | |
| "reward_std": 0.9369174838066101, | |
| "rewards/equation_reward_func": 0.2083333432674408, | |
| "rewards/format_reward_func": 1.861388921737671, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 485.1666717529297, | |
| "epoch": 0.04119086460032626, | |
| "grad_norm": 0.9224224522873015, | |
| "kl": 0.21337890625, | |
| "learning_rate": 7.916586983003533e-07, | |
| "loss": 0.0002, | |
| "reward": 2.622014045715332, | |
| "reward_std": 0.832764744758606, | |
| "rewards/equation_reward_func": 0.7916666865348816, | |
| "rewards/format_reward_func": 1.8303472995758057, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 498.3125, | |
| "epoch": 0.041598694942903754, | |
| "grad_norm": 0.8732846033594475, | |
| "kl": 0.09912109375, | |
| "learning_rate": 7.871643313414718e-07, | |
| "loss": 0.0001, | |
| "reward": 2.6251736879348755, | |
| "reward_std": 0.7892851531505585, | |
| "rewards/equation_reward_func": 0.875, | |
| "rewards/format_reward_func": 1.7501736879348755, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 436.4375, | |
| "epoch": 0.04200652528548124, | |
| "grad_norm": 1.182953041273801, | |
| "kl": 0.111328125, | |
| "learning_rate": 7.826350715583358e-07, | |
| "loss": 0.0001, | |
| "reward": 2.4944097995758057, | |
| "reward_std": 0.7472249865531921, | |
| "rewards/equation_reward_func": 0.7083333730697632, | |
| "rewards/format_reward_func": 1.7860764265060425, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 326.7083435058594, | |
| "epoch": 0.04241435562805873, | |
| "grad_norm": 1.6056713694544402, | |
| "kl": 0.117919921875, | |
| "learning_rate": 7.780714692932002e-07, | |
| "loss": 0.0001, | |
| "reward": 2.952360987663269, | |
| "reward_std": 0.8980874419212341, | |
| "rewards/equation_reward_func": 1.2500000596046448, | |
| "rewards/format_reward_func": 1.7023611664772034, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 470.50001525878906, | |
| "epoch": 0.042822185970636216, | |
| "grad_norm": 1.0414434379733573, | |
| "kl": 0.093994140625, | |
| "learning_rate": 7.734740790612136e-07, | |
| "loss": 0.0001, | |
| "reward": 2.6838542222976685, | |
| "reward_std": 1.0669545829296112, | |
| "rewards/equation_reward_func": 0.7916666865348816, | |
| "rewards/format_reward_func": 1.8921875357627869, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 577.0625305175781, | |
| "epoch": 0.0432300163132137, | |
| "grad_norm": 1.0292452997964692, | |
| "kl": 0.088134765625, | |
| "learning_rate": 7.688434594830391e-07, | |
| "loss": 0.0001, | |
| "reward": 1.7830208539962769, | |
| "reward_std": 0.7203674912452698, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.7830208539962769, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 443.1875, | |
| "epoch": 0.04363784665579119, | |
| "grad_norm": 1.1857098524166056, | |
| "kl": 0.169921875, | |
| "learning_rate": 7.641801732169795e-07, | |
| "loss": 0.0002, | |
| "reward": 2.93423593044281, | |
| "reward_std": 1.0427783131599426, | |
| "rewards/equation_reward_func": 1.125, | |
| "rewards/format_reward_func": 1.809236228466034, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 489.66668701171875, | |
| "epoch": 0.04404567699836868, | |
| "grad_norm": 0.8656675869839974, | |
| "kl": 0.13232421875, | |
| "learning_rate": 7.594847868906076e-07, | |
| "loss": 0.0001, | |
| "reward": 2.7152432203292847, | |
| "reward_std": 1.0624222159385681, | |
| "rewards/equation_reward_func": 1.1250000298023224, | |
| "rewards/format_reward_func": 1.59024316072464, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 396.4166717529297, | |
| "epoch": 0.04445350734094616, | |
| "grad_norm": 1.2559326528283787, | |
| "kl": 0.1279296875, | |
| "learning_rate": 7.547578710319174e-07, | |
| "loss": 0.0001, | |
| "reward": 3.0093750953674316, | |
| "reward_std": 0.960063099861145, | |
| "rewards/equation_reward_func": 1.2916666865348816, | |
| "rewards/format_reward_func": 1.71770840883255, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 444.39585876464844, | |
| "epoch": 0.044861337683523655, | |
| "grad_norm": 1.0899183864856226, | |
| "kl": 0.378173828125, | |
| "learning_rate": 7.5e-07, | |
| "loss": 0.0004, | |
| "reward": 2.7316668033599854, | |
| "reward_std": 0.6873580813407898, | |
| "rewards/equation_reward_func": 0.8333333358168602, | |
| "rewards/format_reward_func": 1.898333489894867, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 464.70835876464844, | |
| "epoch": 0.04526916802610114, | |
| "grad_norm": 0.8216128962927133, | |
| "kl": 0.115966796875, | |
| "learning_rate": 7.452117519152541e-07, | |
| "loss": 0.0001, | |
| "reward": 2.5972570180892944, | |
| "reward_std": 0.7697752714157104, | |
| "rewards/equation_reward_func": 0.75, | |
| "rewards/format_reward_func": 1.8472568988800049, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 531.8958587646484, | |
| "epoch": 0.04567699836867863, | |
| "grad_norm": 0.8041197621718661, | |
| "kl": 0.144775390625, | |
| "learning_rate": 7.403937085891397e-07, | |
| "loss": 0.0001, | |
| "reward": 2.5897916555404663, | |
| "reward_std": 0.7150984704494476, | |
| "rewards/equation_reward_func": 0.9166666865348816, | |
| "rewards/format_reward_func": 1.6731250286102295, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 647.2500305175781, | |
| "epoch": 0.04608482871125612, | |
| "grad_norm": 1.1739617017348787, | |
| "kl": 0.093017578125, | |
| "learning_rate": 7.355464554534836e-07, | |
| "loss": 0.0001, | |
| "reward": 2.5721182823181152, | |
| "reward_std": 1.2919026017189026, | |
| "rewards/equation_reward_func": 0.7083333432674408, | |
| "rewards/format_reward_func": 1.8637848496437073, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 618.6041870117188, | |
| "epoch": 0.0464926590538336, | |
| "grad_norm": 0.786794707945746, | |
| "kl": 0.091064453125, | |
| "learning_rate": 7.306705814893439e-07, | |
| "loss": 0.0001, | |
| "reward": 2.3971527814865112, | |
| "reward_std": 0.9052496254444122, | |
| "rewards/equation_reward_func": 0.5, | |
| "rewards/format_reward_func": 1.8971527814865112, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 378.6041717529297, | |
| "epoch": 0.046900489396411095, | |
| "grad_norm": 1.1431295777314772, | |
| "kl": 0.14111328125, | |
| "learning_rate": 7.257666791554447e-07, | |
| "loss": 0.0001, | |
| "reward": 3.0398958921432495, | |
| "reward_std": 0.7211508750915527, | |
| "rewards/equation_reward_func": 1.25, | |
| "rewards/format_reward_func": 1.7898958921432495, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 390.97918701171875, | |
| "epoch": 0.04730831973898858, | |
| "grad_norm": 1.3315143130569003, | |
| "kl": 0.1064453125, | |
| "learning_rate": 7.20835344316187e-07, | |
| "loss": 0.0001, | |
| "reward": 3.165451407432556, | |
| "reward_std": 0.8909177780151367, | |
| "rewards/equation_reward_func": 1.375, | |
| "rewards/format_reward_func": 1.7904514074325562, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 535.4166870117188, | |
| "epoch": 0.047716150081566065, | |
| "grad_norm": 0.8301567265751203, | |
| "kl": 0.10302734375, | |
| "learning_rate": 7.158771761692464e-07, | |
| "loss": 0.0001, | |
| "reward": 2.5394792556762695, | |
| "reward_std": 0.8184142112731934, | |
| "rewards/equation_reward_func": 0.75, | |
| "rewards/format_reward_func": 1.7894791960716248, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 569.0833435058594, | |
| "epoch": 0.04812398042414356, | |
| "grad_norm": 0.7756037952607522, | |
| "kl": 0.11376953125, | |
| "learning_rate": 7.108927771727661e-07, | |
| "loss": 0.0001, | |
| "reward": 2.432673692703247, | |
| "reward_std": 0.8917776942253113, | |
| "rewards/equation_reward_func": 0.5416666865348816, | |
| "rewards/format_reward_func": 1.8910069465637207, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 668.8125305175781, | |
| "epoch": 0.04853181076672104, | |
| "grad_norm": 1.0059181715098997, | |
| "kl": 0.084228515625, | |
| "learning_rate": 7.058827529721525e-07, | |
| "loss": 0.0001, | |
| "reward": 1.8420140147209167, | |
| "reward_std": 0.8704274594783783, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.842013955116272, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 530.625, | |
| "epoch": 0.048939641109298535, | |
| "grad_norm": 0.9400020844643003, | |
| "kl": 0.095458984375, | |
| "learning_rate": 7.008477123264847e-07, | |
| "loss": 0.0001, | |
| "reward": 2.611979365348816, | |
| "reward_std": 0.9893729388713837, | |
| "rewards/equation_reward_func": 0.6250000298023224, | |
| "rewards/format_reward_func": 1.9869792461395264, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 593.2083740234375, | |
| "epoch": 0.04934747145187602, | |
| "grad_norm": 0.7712890526722358, | |
| "kl": 0.086669921875, | |
| "learning_rate": 6.957882670345458e-07, | |
| "loss": 0.0001, | |
| "reward": 2.4697917699813843, | |
| "reward_std": 1.0504232347011566, | |
| "rewards/equation_reward_func": 0.5416666865348816, | |
| "rewards/format_reward_func": 1.9281250834465027, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 594.3958435058594, | |
| "epoch": 0.049755301794453505, | |
| "grad_norm": 0.8588310596425649, | |
| "kl": 0.0986328125, | |
| "learning_rate": 6.90705031860483e-07, | |
| "loss": 0.0001, | |
| "reward": 1.9615973234176636, | |
| "reward_std": 0.6058675646781921, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.9615971446037292, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 448.3125228881836, | |
| "epoch": 0.050163132137031, | |
| "grad_norm": 1.8069886458180306, | |
| "kl": 0.113525390625, | |
| "learning_rate": 6.855986244591103e-07, | |
| "loss": 0.0001, | |
| "reward": 2.5989930629730225, | |
| "reward_std": 0.7186898589134216, | |
| "rewards/equation_reward_func": 0.8333333730697632, | |
| "rewards/format_reward_func": 1.7656598091125488, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 487.33335876464844, | |
| "epoch": 0.05057096247960848, | |
| "grad_norm": 1.9638362270119054, | |
| "kl": 0.1494140625, | |
| "learning_rate": 6.804696653008574e-07, | |
| "loss": 0.0001, | |
| "reward": 2.645763874053955, | |
| "reward_std": 0.979635089635849, | |
| "rewards/equation_reward_func": 0.8333333730697632, | |
| "rewards/format_reward_func": 1.8124305605888367, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 536.5833740234375, | |
| "epoch": 0.050978792822185974, | |
| "grad_norm": 0.8482235665719111, | |
| "kl": 0.094482421875, | |
| "learning_rate": 6.753187775963772e-07, | |
| "loss": 0.0001, | |
| "reward": 1.9076389074325562, | |
| "reward_std": 0.5429915189743042, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.9076389074325562, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 398.5208435058594, | |
| "epoch": 0.05138662316476346, | |
| "grad_norm": 1.5501066370417451, | |
| "kl": 0.10986328125, | |
| "learning_rate": 6.701465872208216e-07, | |
| "loss": 0.0001, | |
| "reward": 2.95270836353302, | |
| "reward_std": 0.8335215449333191, | |
| "rewards/equation_reward_func": 1.166666716337204, | |
| "rewards/format_reward_func": 1.7860416173934937, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 427.0000305175781, | |
| "epoch": 0.051794453507340944, | |
| "grad_norm": 0.8535810763610331, | |
| "kl": 0.13330078125, | |
| "learning_rate": 6.649537226377914e-07, | |
| "loss": 0.0001, | |
| "reward": 2.610729217529297, | |
| "reward_std": 0.6564360558986664, | |
| "rewards/equation_reward_func": 0.7916666865348816, | |
| "rewards/format_reward_func": 1.8190626502037048, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 459.0208435058594, | |
| "epoch": 0.052202283849918436, | |
| "grad_norm": 1.253749573558056, | |
| "kl": 0.12646484375, | |
| "learning_rate": 6.597408148229741e-07, | |
| "loss": 0.0001, | |
| "reward": 2.7206597328186035, | |
| "reward_std": 0.5680619776248932, | |
| "rewards/equation_reward_func": 0.75, | |
| "rewards/format_reward_func": 1.970659613609314, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 459.16668701171875, | |
| "epoch": 0.05261011419249592, | |
| "grad_norm": 1.5903888848286962, | |
| "kl": 0.12060546875, | |
| "learning_rate": 6.545084971874736e-07, | |
| "loss": 0.0001, | |
| "reward": 2.6335763931274414, | |
| "reward_std": 0.6806207001209259, | |
| "rewards/equation_reward_func": 0.7083333730697632, | |
| "rewards/format_reward_func": 1.925243079662323, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 390.5208435058594, | |
| "epoch": 0.05301794453507341, | |
| "grad_norm": 1.5527056038492013, | |
| "kl": 0.118408203125, | |
| "learning_rate": 6.492574055008473e-07, | |
| "loss": 0.0001, | |
| "reward": 2.4415969848632812, | |
| "reward_std": 0.600139319896698, | |
| "rewards/equation_reward_func": 0.625, | |
| "rewards/format_reward_func": 1.81659734249115, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 531.9166870117188, | |
| "epoch": 0.0534257748776509, | |
| "grad_norm": 1.4622847388569014, | |
| "kl": 0.12255859375, | |
| "learning_rate": 6.439881778138531e-07, | |
| "loss": 0.0001, | |
| "reward": 2.7648611068725586, | |
| "reward_std": 0.6601312011480331, | |
| "rewards/equation_reward_func": 0.7083333730697632, | |
| "rewards/format_reward_func": 2.056527853012085, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 515.7916870117188, | |
| "epoch": 0.053833605220228384, | |
| "grad_norm": 1.386912182976191, | |
| "kl": 0.138427734375, | |
| "learning_rate": 6.387014543809223e-07, | |
| "loss": 0.0001, | |
| "reward": 2.7789584398269653, | |
| "reward_std": 0.6216670870780945, | |
| "rewards/equation_reward_func": 0.8333333730697632, | |
| "rewards/format_reward_func": 1.9456250667572021, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 350.0208435058594, | |
| "epoch": 0.054241435562805876, | |
| "grad_norm": 1.32827783707619, | |
| "kl": 0.14892578125, | |
| "learning_rate": 6.333978775823631e-07, | |
| "loss": 0.0001, | |
| "reward": 3.3326735496520996, | |
| "reward_std": 0.5917892754077911, | |
| "rewards/equation_reward_func": 1.5833333730697632, | |
| "rewards/format_reward_func": 1.7493401765823364, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 418.87501525878906, | |
| "epoch": 0.05464926590538336, | |
| "grad_norm": 1.7098008888477785, | |
| "kl": 0.162109375, | |
| "learning_rate": 6.280780918463057e-07, | |
| "loss": 0.0002, | |
| "reward": 2.589617967605591, | |
| "reward_std": 0.6024105995893478, | |
| "rewards/equation_reward_func": 0.75, | |
| "rewards/format_reward_func": 1.8396180868148804, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 513.5625152587891, | |
| "epoch": 0.055057096247960846, | |
| "grad_norm": 1.117454148344459, | |
| "kl": 0.16162109375, | |
| "learning_rate": 6.227427435703995e-07, | |
| "loss": 0.0002, | |
| "reward": 2.7452430725097656, | |
| "reward_std": 1.3093486428260803, | |
| "rewards/equation_reward_func": 1.125, | |
| "rewards/format_reward_func": 1.6202431321144104, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 616.9166870117188, | |
| "epoch": 0.05546492659053834, | |
| "grad_norm": 0.9401293165638854, | |
| "kl": 0.1728515625, | |
| "learning_rate": 6.173924810432704e-07, | |
| "loss": 0.0002, | |
| "reward": 2.637951374053955, | |
| "reward_std": 0.9178789854049683, | |
| "rewards/equation_reward_func": 0.625, | |
| "rewards/format_reward_func": 2.0129514336586, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 651.5833740234375, | |
| "epoch": 0.05587275693311582, | |
| "grad_norm": 0.9567005196766424, | |
| "kl": 0.14697265625, | |
| "learning_rate": 6.12027954365748e-07, | |
| "loss": 0.0001, | |
| "reward": 1.926597237586975, | |
| "reward_std": 0.8993740975856781, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 1.8849304914474487, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 634.5208435058594, | |
| "epoch": 0.05628058727569331, | |
| "grad_norm": 1.358076325982119, | |
| "kl": 0.13427734375, | |
| "learning_rate": 6.066498153718734e-07, | |
| "loss": 0.0001, | |
| "reward": 2.3687500953674316, | |
| "reward_std": 1.000350534915924, | |
| "rewards/equation_reward_func": 0.4166666865348816, | |
| "rewards/format_reward_func": 1.9520832300186157, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 517.4166870117188, | |
| "epoch": 0.0566884176182708, | |
| "grad_norm": 1.5910966658956645, | |
| "kl": 0.1484375, | |
| "learning_rate": 6.01258717549696e-07, | |
| "loss": 0.0001, | |
| "reward": 2.7329167127609253, | |
| "reward_std": 0.6304636597633362, | |
| "rewards/equation_reward_func": 1.0, | |
| "rewards/format_reward_func": 1.7329167127609253, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 693.8125305175781, | |
| "epoch": 0.057096247960848286, | |
| "grad_norm": 1.0227721122110067, | |
| "kl": 0.12451171875, | |
| "learning_rate": 5.958553159618692e-07, | |
| "loss": 0.0001, | |
| "reward": 2.247395873069763, | |
| "reward_std": 0.8254929631948471, | |
| "rewards/equation_reward_func": 0.1666666716337204, | |
| "rewards/format_reward_func": 2.0807292461395264, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 699.3125305175781, | |
| "epoch": 0.05750407830342578, | |
| "grad_norm": 0.9489795742265218, | |
| "kl": 0.1298828125, | |
| "learning_rate": 5.90440267166055e-07, | |
| "loss": 0.0001, | |
| "reward": 2.170659899711609, | |
| "reward_std": 0.7723036706447601, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 2.128993034362793, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 652.3333435058594, | |
| "epoch": 0.05791190864600326, | |
| "grad_norm": 1.2529290369484056, | |
| "kl": 0.13623046875, | |
| "learning_rate": 5.850142291351465e-07, | |
| "loss": 0.0001, | |
| "reward": 2.2495139837265015, | |
| "reward_std": 0.8483322262763977, | |
| "rewards/equation_reward_func": 0.2083333432674408, | |
| "rewards/format_reward_func": 2.0411804914474487, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 367.91668701171875, | |
| "epoch": 0.05831973898858075, | |
| "grad_norm": 1.40910503148467, | |
| "kl": 0.16064453125, | |
| "learning_rate": 5.795778611773197e-07, | |
| "loss": 0.0002, | |
| "reward": 3.3931944370269775, | |
| "reward_std": 0.7464114725589752, | |
| "rewards/equation_reward_func": 1.5833333730697632, | |
| "rewards/format_reward_func": 1.809861183166504, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 525.25, | |
| "epoch": 0.05872756933115824, | |
| "grad_norm": 1.4485621962260458, | |
| "kl": 0.150390625, | |
| "learning_rate": 5.741318238559209e-07, | |
| "loss": 0.0002, | |
| "reward": 3.194236159324646, | |
| "reward_std": 0.8501316905021667, | |
| "rewards/equation_reward_func": 1.166666716337204, | |
| "rewards/format_reward_func": 2.027569532394409, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 683.4166870117188, | |
| "epoch": 0.059135399673735725, | |
| "grad_norm": 1.1717608782080553, | |
| "kl": 0.1416015625, | |
| "learning_rate": 5.686767789092041e-07, | |
| "loss": 0.0001, | |
| "reward": 2.2353820204734802, | |
| "reward_std": 1.0596205294132233, | |
| "rewards/equation_reward_func": 0.2916666865348816, | |
| "rewards/format_reward_func": 1.9437153339385986, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 401.06251525878906, | |
| "epoch": 0.05954323001631321, | |
| "grad_norm": 1.469208845355834, | |
| "kl": 0.18115234375, | |
| "learning_rate": 5.632133891699231e-07, | |
| "loss": 0.0002, | |
| "reward": 3.5194097757339478, | |
| "reward_std": 0.6208974719047546, | |
| "rewards/equation_reward_func": 1.7083333730697632, | |
| "rewards/format_reward_func": 1.8110764622688293, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 726.2916870117188, | |
| "epoch": 0.0599510603588907, | |
| "grad_norm": 1.3400119923719016, | |
| "kl": 0.135009765625, | |
| "learning_rate": 5.577423184847931e-07, | |
| "loss": 0.0001, | |
| "reward": 2.288576364517212, | |
| "reward_std": 0.4163784384727478, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 2.288576364517212, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 408.81251525878906, | |
| "epoch": 0.06035889070146819, | |
| "grad_norm": 1.6377898052337172, | |
| "kl": 0.16259765625, | |
| "learning_rate": 5.522642316338268e-07, | |
| "loss": 0.0002, | |
| "reward": 3.5491667985916138, | |
| "reward_std": 0.6619178652763367, | |
| "rewards/equation_reward_func": 1.6666666865348816, | |
| "rewards/format_reward_func": 1.8825000524520874, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 662.7916870117188, | |
| "epoch": 0.06076672104404568, | |
| "grad_norm": 1.0601282664430811, | |
| "kl": 0.16845703125, | |
| "learning_rate": 5.467797942495589e-07, | |
| "loss": 0.0002, | |
| "reward": 2.235729455947876, | |
| "reward_std": 1.0819981396198273, | |
| "rewards/equation_reward_func": 0.3333333544433117, | |
| "rewards/format_reward_func": 1.9023959636688232, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 563.1875305175781, | |
| "epoch": 0.061174551386623165, | |
| "grad_norm": 0.9614419208572907, | |
| "kl": 0.193359375, | |
| "learning_rate": 5.412896727361662e-07, | |
| "loss": 0.0002, | |
| "reward": 2.6000348329544067, | |
| "reward_std": 0.9562007784843445, | |
| "rewards/equation_reward_func": 0.7916666865348816, | |
| "rewards/format_reward_func": 1.8083681464195251, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 819.2916870117188, | |
| "epoch": 0.06158238172920065, | |
| "grad_norm": 0.9771200528468638, | |
| "kl": 0.128662109375, | |
| "learning_rate": 5.357945341884935e-07, | |
| "loss": 0.0001, | |
| "reward": 1.7778472304344177, | |
| "reward_std": 1.077535629272461, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.7778472900390625, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 553.4166870117188, | |
| "epoch": 0.06199021207177814, | |
| "grad_norm": 1.1955694878738563, | |
| "kl": 0.23388671875, | |
| "learning_rate": 5.302950463109969e-07, | |
| "loss": 0.0002, | |
| "reward": 2.829687714576721, | |
| "reward_std": 1.1161695718765259, | |
| "rewards/equation_reward_func": 0.916666716337204, | |
| "rewards/format_reward_func": 1.9130208492279053, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 710.7291870117188, | |
| "epoch": 0.06239804241435563, | |
| "grad_norm": 1.3779449810449336, | |
| "kl": 0.1650390625, | |
| "learning_rate": 5.247918773366111e-07, | |
| "loss": 0.0002, | |
| "reward": 2.022604286670685, | |
| "reward_std": 0.8615612387657166, | |
| "rewards/equation_reward_func": 0.0833333358168602, | |
| "rewards/format_reward_func": 1.9392709136009216, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 589.2083435058594, | |
| "epoch": 0.06280587275693311, | |
| "grad_norm": 1.1497542405094912, | |
| "kl": 0.1552734375, | |
| "learning_rate": 5.192856959455552e-07, | |
| "loss": 0.0002, | |
| "reward": 3.0122569799423218, | |
| "reward_std": 0.8370742797851562, | |
| "rewards/equation_reward_func": 0.9583333544433117, | |
| "rewards/format_reward_func": 2.0539236068725586, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 373.6041717529297, | |
| "epoch": 0.0632137030995106, | |
| "grad_norm": 1.527186652658105, | |
| "kl": 0.24267578125, | |
| "learning_rate": 5.137771711840811e-07, | |
| "loss": 0.0002, | |
| "reward": 3.4156596660614014, | |
| "reward_std": 0.5625910460948944, | |
| "rewards/equation_reward_func": 1.6250000596046448, | |
| "rewards/format_reward_func": 1.790659785270691, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 395.37501525878906, | |
| "epoch": 0.0636215334420881, | |
| "grad_norm": 1.7273540516921173, | |
| "kl": 0.20361328125, | |
| "learning_rate": 5.082669723831793e-07, | |
| "loss": 0.0002, | |
| "reward": 3.51725697517395, | |
| "reward_std": 0.6541823446750641, | |
| "rewards/equation_reward_func": 1.6666667461395264, | |
| "rewards/format_reward_func": 1.8505903482437134, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 385.5208435058594, | |
| "epoch": 0.06402936378466557, | |
| "grad_norm": 1.698447178314297, | |
| "kl": 0.2001953125, | |
| "learning_rate": 5.027557690772503e-07, | |
| "loss": 0.0002, | |
| "reward": 3.766666889190674, | |
| "reward_std": 0.5374718904495239, | |
| "rewards/equation_reward_func": 1.9166667461395264, | |
| "rewards/format_reward_func": 1.850000023841858, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 597.7083435058594, | |
| "epoch": 0.06443719412724307, | |
| "grad_norm": 1.2167733375338632, | |
| "kl": 0.25439453125, | |
| "learning_rate": 4.972442309227498e-07, | |
| "loss": 0.0003, | |
| "reward": 2.8143749237060547, | |
| "reward_std": 0.9374454021453857, | |
| "rewards/equation_reward_func": 0.9583333544433117, | |
| "rewards/format_reward_func": 1.856041669845581, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 687.9375305175781, | |
| "epoch": 0.06484502446982056, | |
| "grad_norm": 1.1609904401618012, | |
| "kl": 0.17822265625, | |
| "learning_rate": 4.917330276168208e-07, | |
| "loss": 0.0002, | |
| "reward": 2.6170140504837036, | |
| "reward_std": 1.1427516341209412, | |
| "rewards/equation_reward_func": 0.6250000409781933, | |
| "rewards/format_reward_func": 1.9920140504837036, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 684.1250305175781, | |
| "epoch": 0.06525285481239804, | |
| "grad_norm": 1.0056701516120197, | |
| "kl": 0.20947265625, | |
| "learning_rate": 4.86222828815919e-07, | |
| "loss": 0.0002, | |
| "reward": 2.5053821802139282, | |
| "reward_std": 0.9234158992767334, | |
| "rewards/equation_reward_func": 0.4166666865348816, | |
| "rewards/format_reward_func": 2.088715434074402, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 673.8750305175781, | |
| "epoch": 0.06566068515497553, | |
| "grad_norm": 1.0975585144657385, | |
| "kl": 0.18798828125, | |
| "learning_rate": 4.807143040544446e-07, | |
| "loss": 0.0002, | |
| "reward": 3.1653473377227783, | |
| "reward_std": 1.1166218519210815, | |
| "rewards/equation_reward_func": 1.0416666865348816, | |
| "rewards/format_reward_func": 2.123680830001831, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 830.4791870117188, | |
| "epoch": 0.06606851549755302, | |
| "grad_norm": 1.0602033792436276, | |
| "kl": 0.1513671875, | |
| "learning_rate": 4.752081226633888e-07, | |
| "loss": 0.0002, | |
| "reward": 1.821250081062317, | |
| "reward_std": 1.0921660661697388, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.821250081062317, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 639.4375305175781, | |
| "epoch": 0.0664763458401305, | |
| "grad_norm": 1.100729586233426, | |
| "kl": 0.1552734375, | |
| "learning_rate": 4.697049536890033e-07, | |
| "loss": 0.0002, | |
| "reward": 2.8303472995758057, | |
| "reward_std": 0.8279085159301758, | |
| "rewards/equation_reward_func": 0.7916666865348816, | |
| "rewards/format_reward_func": 2.038680672645569, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 686.9375305175781, | |
| "epoch": 0.06688417618270799, | |
| "grad_norm": 1.0245543586959098, | |
| "kl": 0.17041015625, | |
| "learning_rate": 4.642054658115066e-07, | |
| "loss": 0.0002, | |
| "reward": 2.826840400695801, | |
| "reward_std": 0.9287701547145844, | |
| "rewards/equation_reward_func": 0.7916666865348816, | |
| "rewards/format_reward_func": 2.0351736545562744, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 527.375, | |
| "epoch": 0.06729200652528548, | |
| "grad_norm": 1.0458228845510584, | |
| "kl": 0.173828125, | |
| "learning_rate": 4.5871032726383385e-07, | |
| "loss": 0.0002, | |
| "reward": 3.5178821086883545, | |
| "reward_std": 1.0241894721984863, | |
| "rewards/equation_reward_func": 1.5416666865348816, | |
| "rewards/format_reward_func": 1.9762153625488281, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 574.0000152587891, | |
| "epoch": 0.06769983686786298, | |
| "grad_norm": 1.3045692619665175, | |
| "kl": 0.1962890625, | |
| "learning_rate": 4.532202057504411e-07, | |
| "loss": 0.0002, | |
| "reward": 3.1286113262176514, | |
| "reward_std": 1.1157885491847992, | |
| "rewards/equation_reward_func": 1.125, | |
| "rewards/format_reward_func": 2.003611207008362, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 783.9375305175781, | |
| "epoch": 0.06810766721044045, | |
| "grad_norm": 1.0379555283649509, | |
| "kl": 0.15966796875, | |
| "learning_rate": 4.477357683661733e-07, | |
| "loss": 0.0002, | |
| "reward": 2.033749997615814, | |
| "reward_std": 0.8268265128135681, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 2.0337501168251038, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 748.5416870117188, | |
| "epoch": 0.06851549755301795, | |
| "grad_norm": 1.0608562150572372, | |
| "kl": 0.3369140625, | |
| "learning_rate": 4.4225768151520694e-07, | |
| "loss": 0.0003, | |
| "reward": 2.0848264694213867, | |
| "reward_std": 0.8205204904079437, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 2.0848264694213867, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 815.5625305175781, | |
| "epoch": 0.06892332789559544, | |
| "grad_norm": 1.0418888295786408, | |
| "kl": 0.18310546875, | |
| "learning_rate": 4.3678661083007685e-07, | |
| "loss": 0.0002, | |
| "reward": 1.7861458659172058, | |
| "reward_std": 1.0723278522491455, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 1.7861458659172058, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 555.1666717529297, | |
| "epoch": 0.06933115823817292, | |
| "grad_norm": 1.4518474976562326, | |
| "kl": 0.17236328125, | |
| "learning_rate": 4.313232210907959e-07, | |
| "loss": 0.0002, | |
| "reward": 3.1211459636688232, | |
| "reward_std": 0.9402068853378296, | |
| "rewards/equation_reward_func": 1.0416666865348816, | |
| "rewards/format_reward_func": 2.0794793367385864, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 500.79168701171875, | |
| "epoch": 0.06973898858075041, | |
| "grad_norm": 1.5942769271225012, | |
| "kl": 0.20947265625, | |
| "learning_rate": 4.258681761440789e-07, | |
| "loss": 0.0002, | |
| "reward": 3.415902853012085, | |
| "reward_std": 0.6841486990451813, | |
| "rewards/equation_reward_func": 1.3750000298023224, | |
| "rewards/format_reward_func": 2.04090279340744, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 648.5625, | |
| "epoch": 0.0701468189233279, | |
| "grad_norm": 1.063650258930375, | |
| "kl": 0.15478515625, | |
| "learning_rate": 4.2042213882268025e-07, | |
| "loss": 0.0002, | |
| "reward": 2.6829168796539307, | |
| "reward_std": 0.8894191086292267, | |
| "rewards/equation_reward_func": 0.5416666865348816, | |
| "rewards/format_reward_func": 2.1412501335144043, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 561.6666870117188, | |
| "epoch": 0.07055464926590538, | |
| "grad_norm": 1.457579727519852, | |
| "kl": 0.17724609375, | |
| "learning_rate": 4.149857708648535e-07, | |
| "loss": 0.0002, | |
| "reward": 2.8364583253860474, | |
| "reward_std": 0.47896429151296616, | |
| "rewards/equation_reward_func": 0.6666666865348816, | |
| "rewards/format_reward_func": 2.1697916984558105, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 642.3541870117188, | |
| "epoch": 0.07096247960848287, | |
| "grad_norm": 1.3912364072475296, | |
| "kl": 0.16064453125, | |
| "learning_rate": 4.095597328339452e-07, | |
| "loss": 0.0002, | |
| "reward": 2.601736068725586, | |
| "reward_std": 0.6701973676681519, | |
| "rewards/equation_reward_func": 0.3333333544433117, | |
| "rewards/format_reward_func": 2.2684028148651123, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 565.8541870117188, | |
| "epoch": 0.07137030995106036, | |
| "grad_norm": 1.4335488442307787, | |
| "kl": 0.15869140625, | |
| "learning_rate": 4.041446840381309e-07, | |
| "loss": 0.0002, | |
| "reward": 2.8512500524520874, | |
| "reward_std": 0.6541395485401154, | |
| "rewards/equation_reward_func": 0.7916666865348816, | |
| "rewards/format_reward_func": 2.0595834851264954, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 592.4583435058594, | |
| "epoch": 0.07177814029363784, | |
| "grad_norm": 1.6711244477487293, | |
| "kl": 0.1640625, | |
| "learning_rate": 3.98741282450304e-07, | |
| "loss": 0.0002, | |
| "reward": 2.683958411216736, | |
| "reward_std": 0.9120919704437256, | |
| "rewards/equation_reward_func": 0.5833333730697632, | |
| "rewards/format_reward_func": 2.1006250977516174, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 647.5625305175781, | |
| "epoch": 0.07218597063621533, | |
| "grad_norm": 1.3418439173813352, | |
| "kl": 0.1611328125, | |
| "learning_rate": 3.9335018462812664e-07, | |
| "loss": 0.0002, | |
| "reward": 2.5612502098083496, | |
| "reward_std": 0.9770323932170868, | |
| "rewards/equation_reward_func": 0.5, | |
| "rewards/format_reward_func": 2.06125009059906, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 593.5, | |
| "epoch": 0.07259380097879282, | |
| "grad_norm": 0.8818594196791458, | |
| "kl": 0.15478515625, | |
| "learning_rate": 3.879720456342521e-07, | |
| "loss": 0.0002, | |
| "reward": 2.828229308128357, | |
| "reward_std": 0.7171844244003296, | |
| "rewards/equation_reward_func": 0.7083333730697632, | |
| "rewards/format_reward_func": 2.119895815849304, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 458.3125, | |
| "epoch": 0.07300163132137032, | |
| "grad_norm": 1.4981060215710775, | |
| "kl": 0.1640625, | |
| "learning_rate": 3.8260751895672954e-07, | |
| "loss": 0.0002, | |
| "reward": 3.318923592567444, | |
| "reward_std": 0.8009838759899139, | |
| "rewards/equation_reward_func": 1.3333333432674408, | |
| "rewards/format_reward_func": 1.9855904579162598, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 455.9166717529297, | |
| "epoch": 0.0734094616639478, | |
| "grad_norm": 1.3586016748782879, | |
| "kl": 0.18408203125, | |
| "learning_rate": 3.772572564296004e-07, | |
| "loss": 0.0002, | |
| "reward": 3.7097569704055786, | |
| "reward_std": 0.5928686857223511, | |
| "rewards/equation_reward_func": 1.7083333730697632, | |
| "rewards/format_reward_func": 2.001423716545105, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 673.6875305175781, | |
| "epoch": 0.07381729200652529, | |
| "grad_norm": 1.2301932410337926, | |
| "kl": 0.1787109375, | |
| "learning_rate": 3.719219081536942e-07, | |
| "loss": 0.0002, | |
| "reward": 2.9322917461395264, | |
| "reward_std": 1.2178776860237122, | |
| "rewards/equation_reward_func": 0.8333333730697632, | |
| "rewards/format_reward_func": 2.098958373069763, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 621.8333435058594, | |
| "epoch": 0.07422512234910278, | |
| "grad_norm": 1.2515697084532456, | |
| "kl": 0.15673828125, | |
| "learning_rate": 3.666021224176369e-07, | |
| "loss": 0.0002, | |
| "reward": 3.1526390314102173, | |
| "reward_std": 1.0030421912670135, | |
| "rewards/equation_reward_func": 0.9166666865348816, | |
| "rewards/format_reward_func": 2.235972285270691, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 615.0000305175781, | |
| "epoch": 0.07463295269168026, | |
| "grad_norm": 1.5169652601615895, | |
| "kl": 0.19091796875, | |
| "learning_rate": 3.612985456190778e-07, | |
| "loss": 0.0002, | |
| "reward": 2.936007022857666, | |
| "reward_std": 0.7388836741447449, | |
| "rewards/equation_reward_func": 0.875, | |
| "rewards/format_reward_func": 2.0610069632530212, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 592.9583435058594, | |
| "epoch": 0.07504078303425775, | |
| "grad_norm": 1.4869853846405452, | |
| "kl": 0.1611328125, | |
| "learning_rate": 3.56011822186147e-07, | |
| "loss": 0.0002, | |
| "reward": 3.217986226081848, | |
| "reward_std": 0.7202288508415222, | |
| "rewards/equation_reward_func": 1.0416666716337204, | |
| "rewards/format_reward_func": 2.1763195991516113, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 696.0, | |
| "epoch": 0.07544861337683524, | |
| "grad_norm": 1.3749187528497846, | |
| "kl": 0.20947265625, | |
| "learning_rate": 3.507425944991528e-07, | |
| "loss": 0.0002, | |
| "reward": 2.269930601119995, | |
| "reward_std": 0.710529625415802, | |
| "rewards/equation_reward_func": 0.0833333358168602, | |
| "rewards/format_reward_func": 2.186597466468811, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 557.3125305175781, | |
| "epoch": 0.07585644371941272, | |
| "grad_norm": 1.4000574837745792, | |
| "kl": 0.16796875, | |
| "learning_rate": 3.454915028125263e-07, | |
| "loss": 0.0002, | |
| "reward": 3.027916669845581, | |
| "reward_std": 1.0261832475662231, | |
| "rewards/equation_reward_func": 1.0416666865348816, | |
| "rewards/format_reward_func": 1.9862500429153442, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 493.1875, | |
| "epoch": 0.07626427406199021, | |
| "grad_norm": 1.3346862160598578, | |
| "kl": 0.17724609375, | |
| "learning_rate": 3.4025918517702593e-07, | |
| "loss": 0.0002, | |
| "reward": 3.429097294807434, | |
| "reward_std": 0.7599293291568756, | |
| "rewards/equation_reward_func": 1.4166666865348816, | |
| "rewards/format_reward_func": 2.012430787086487, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 559.7916717529297, | |
| "epoch": 0.0766721044045677, | |
| "grad_norm": 1.674692584956449, | |
| "kl": 0.25830078125, | |
| "learning_rate": 3.3504627736220857e-07, | |
| "loss": 0.0003, | |
| "reward": 3.2719098329544067, | |
| "reward_std": 0.7151365131139755, | |
| "rewards/equation_reward_func": 1.2083333730697632, | |
| "rewards/format_reward_func": 2.063576579093933, | |
| "step": 376 | |
| }, | |
| { | |
| "completion_length": 504.3125, | |
| "epoch": 0.07707993474714518, | |
| "grad_norm": 1.5590357023489072, | |
| "kl": 0.21337890625, | |
| "learning_rate": 3.2985341277917846e-07, | |
| "loss": 0.0002, | |
| "reward": 2.8944443464279175, | |
| "reward_std": 0.7131877541542053, | |
| "rewards/equation_reward_func": 0.8750000409781933, | |
| "rewards/format_reward_func": 2.0194445848464966, | |
| "step": 378 | |
| }, | |
| { | |
| "completion_length": 735.75, | |
| "epoch": 0.07748776508972267, | |
| "grad_norm": 1.068431271506457, | |
| "kl": 0.1865234375, | |
| "learning_rate": 3.2468122240362285e-07, | |
| "loss": 0.0002, | |
| "reward": 2.7177083492279053, | |
| "reward_std": 0.7962678074836731, | |
| "rewards/equation_reward_func": 0.4166666865348816, | |
| "rewards/format_reward_func": 2.301041841506958, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 796.8541870117188, | |
| "epoch": 0.07789559543230017, | |
| "grad_norm": 1.1087256773204297, | |
| "kl": 0.162109375, | |
| "learning_rate": 3.195303346991427e-07, | |
| "loss": 0.0002, | |
| "reward": 2.0952779054641724, | |
| "reward_std": 0.8598673939704895, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 2.0952779054641724, | |
| "step": 382 | |
| }, | |
| { | |
| "completion_length": 570.0625152587891, | |
| "epoch": 0.07830342577487764, | |
| "grad_norm": 1.4382086945561208, | |
| "kl": 0.17578125, | |
| "learning_rate": 3.1440137554088953e-07, | |
| "loss": 0.0002, | |
| "reward": 2.951319456100464, | |
| "reward_std": 0.7555558383464813, | |
| "rewards/equation_reward_func": 0.875, | |
| "rewards/format_reward_func": 2.076319396495819, | |
| "step": 384 | |
| }, | |
| { | |
| "completion_length": 545.7291870117188, | |
| "epoch": 0.07871125611745514, | |
| "grad_norm": 1.1023634840652674, | |
| "kl": 0.20166015625, | |
| "learning_rate": 3.092949681395169e-07, | |
| "loss": 0.0002, | |
| "reward": 3.21753466129303, | |
| "reward_std": 1.1398820281028748, | |
| "rewards/equation_reward_func": 1.3333333730697632, | |
| "rewards/format_reward_func": 1.8842013478279114, | |
| "step": 386 | |
| }, | |
| { | |
| "completion_length": 540.5625152587891, | |
| "epoch": 0.07911908646003263, | |
| "grad_norm": 1.6785853811175349, | |
| "kl": 0.17529296875, | |
| "learning_rate": 3.042117329654544e-07, | |
| "loss": 0.0002, | |
| "reward": 3.2422919273376465, | |
| "reward_std": 0.8742612600326538, | |
| "rewards/equation_reward_func": 1.0833333432674408, | |
| "rewards/format_reward_func": 2.1589584350585938, | |
| "step": 388 | |
| }, | |
| { | |
| "completion_length": 437.12501525878906, | |
| "epoch": 0.07952691680261012, | |
| "grad_norm": 1.719678147524432, | |
| "kl": 0.25732421875, | |
| "learning_rate": 2.9915228767351535e-07, | |
| "loss": 0.0003, | |
| "reward": 3.636701464653015, | |
| "reward_std": 0.723703920841217, | |
| "rewards/equation_reward_func": 1.6666666865348816, | |
| "rewards/format_reward_func": 1.970034897327423, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 665.4375305175781, | |
| "epoch": 0.0799347471451876, | |
| "grad_norm": 1.2489486438524378, | |
| "kl": 0.2080078125, | |
| "learning_rate": 2.941172470278476e-07, | |
| "loss": 0.0002, | |
| "reward": 2.668472409248352, | |
| "reward_std": 0.8615556359291077, | |
| "rewards/equation_reward_func": 0.541666679084301, | |
| "rewards/format_reward_func": 2.1268056631088257, | |
| "step": 392 | |
| }, | |
| { | |
| "completion_length": 816.0208435058594, | |
| "epoch": 0.08034257748776509, | |
| "grad_norm": 0.947304383944691, | |
| "kl": 0.18359375, | |
| "learning_rate": 2.89107222827234e-07, | |
| "loss": 0.0002, | |
| "reward": 2.104305624961853, | |
| "reward_std": 0.866163969039917, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 2.1043055057525635, | |
| "step": 394 | |
| }, | |
| { | |
| "completion_length": 765.4791870117188, | |
| "epoch": 0.08075040783034258, | |
| "grad_norm": 1.400028124019858, | |
| "kl": 0.19287109375, | |
| "learning_rate": 2.841228238307536e-07, | |
| "loss": 0.0002, | |
| "reward": 2.2613543272018433, | |
| "reward_std": 0.5083828084170818, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 2.2613543272018433, | |
| "step": 396 | |
| }, | |
| { | |
| "completion_length": 781.6250305175781, | |
| "epoch": 0.08115823817292006, | |
| "grad_norm": 1.0827871893574141, | |
| "kl": 0.1767578125, | |
| "learning_rate": 2.79164655683813e-07, | |
| "loss": 0.0002, | |
| "reward": 2.1505903601646423, | |
| "reward_std": 0.7696040868759155, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 2.1505903005599976, | |
| "step": 398 | |
| }, | |
| { | |
| "completion_length": 493.43751525878906, | |
| "epoch": 0.08156606851549755, | |
| "grad_norm": 1.2921133198533294, | |
| "kl": 0.791015625, | |
| "learning_rate": 2.742333208445554e-07, | |
| "loss": 0.0008, | |
| "reward": 3.811944603919983, | |
| "reward_std": 0.5879083275794983, | |
| "rewards/equation_reward_func": 1.7500000596046448, | |
| "rewards/format_reward_func": 2.061944365501404, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 746.0625305175781, | |
| "epoch": 0.08197389885807504, | |
| "grad_norm": 1.305143762318282, | |
| "kl": 0.17529296875, | |
| "learning_rate": 2.6932941851065615e-07, | |
| "loss": 0.0002, | |
| "reward": 2.444791793823242, | |
| "reward_std": 0.4394510090351105, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 2.4031251668930054, | |
| "step": 402 | |
| }, | |
| { | |
| "completion_length": 544.4791717529297, | |
| "epoch": 0.08238172920065252, | |
| "grad_norm": 1.1953377442351993, | |
| "kl": 0.1748046875, | |
| "learning_rate": 2.6445354454651636e-07, | |
| "loss": 0.0002, | |
| "reward": 3.572013735771179, | |
| "reward_std": 0.8854174613952637, | |
| "rewards/equation_reward_func": 1.4583333730697632, | |
| "rewards/format_reward_func": 2.113680601119995, | |
| "step": 404 | |
| }, | |
| { | |
| "completion_length": 804.7708740234375, | |
| "epoch": 0.08278955954323002, | |
| "grad_norm": 0.9435020676134209, | |
| "kl": 0.2001953125, | |
| "learning_rate": 2.596062914108601e-07, | |
| "loss": 0.0002, | |
| "reward": 2.238854169845581, | |
| "reward_std": 0.7229233682155609, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 2.238854169845581, | |
| "step": 406 | |
| }, | |
| { | |
| "completion_length": 771.4166870117188, | |
| "epoch": 0.08319738988580751, | |
| "grad_norm": 1.0179706632601526, | |
| "kl": 0.171875, | |
| "learning_rate": 2.547882480847461e-07, | |
| "loss": 0.0002, | |
| "reward": 2.2952778339385986, | |
| "reward_std": 0.6089069843292236, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 2.2952778339385986, | |
| "step": 408 | |
| }, | |
| { | |
| "completion_length": 615.0625, | |
| "epoch": 0.08360522022838499, | |
| "grad_norm": 1.3095508272095453, | |
| "kl": 0.2060546875, | |
| "learning_rate": 2.500000000000001e-07, | |
| "loss": 0.0002, | |
| "reward": 2.916632056236267, | |
| "reward_std": 1.1873834133148193, | |
| "rewards/equation_reward_func": 0.875, | |
| "rewards/format_reward_func": 2.041632056236267, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 522.5833587646484, | |
| "epoch": 0.08401305057096248, | |
| "grad_norm": 1.3099266975063921, | |
| "kl": 0.1826171875, | |
| "learning_rate": 2.452421289680826e-07, | |
| "loss": 0.0002, | |
| "reward": 3.69243061542511, | |
| "reward_std": 0.7064912915229797, | |
| "rewards/equation_reward_func": 1.5833333730697632, | |
| "rewards/format_reward_func": 2.1090973615646362, | |
| "step": 412 | |
| }, | |
| { | |
| "completion_length": 554.5625305175781, | |
| "epoch": 0.08442088091353997, | |
| "grad_norm": 1.3331866727652546, | |
| "kl": 0.18212890625, | |
| "learning_rate": 2.4051521310939254e-07, | |
| "loss": 0.0002, | |
| "reward": 3.463784694671631, | |
| "reward_std": 0.8228816390037537, | |
| "rewards/equation_reward_func": 1.291666716337204, | |
| "rewards/format_reward_func": 2.1721181869506836, | |
| "step": 414 | |
| }, | |
| { | |
| "completion_length": 422.85418701171875, | |
| "epoch": 0.08482871125611746, | |
| "grad_norm": 1.4824642737659157, | |
| "kl": 0.171875, | |
| "learning_rate": 2.3581982678302058e-07, | |
| "loss": 0.0002, | |
| "reward": 3.8071876764297485, | |
| "reward_std": 0.43291839957237244, | |
| "rewards/equation_reward_func": 1.8750000596046448, | |
| "rewards/format_reward_func": 1.9321874976158142, | |
| "step": 416 | |
| }, | |
| { | |
| "completion_length": 674.2916870117188, | |
| "epoch": 0.08523654159869494, | |
| "grad_norm": 1.5561510539962522, | |
| "kl": 0.19189453125, | |
| "learning_rate": 2.3115654051696092e-07, | |
| "loss": 0.0002, | |
| "reward": 2.5582985877990723, | |
| "reward_std": 0.8232472538948059, | |
| "rewards/equation_reward_func": 0.3333333432674408, | |
| "rewards/format_reward_func": 2.2249653339385986, | |
| "step": 418 | |
| }, | |
| { | |
| "completion_length": 710.0208435058594, | |
| "epoch": 0.08564437194127243, | |
| "grad_norm": 1.2191039991941799, | |
| "kl": 0.205078125, | |
| "learning_rate": 2.2652592093878665e-07, | |
| "loss": 0.0002, | |
| "reward": 2.6046180725097656, | |
| "reward_std": 1.0610361099243164, | |
| "rewards/equation_reward_func": 0.4583333544433117, | |
| "rewards/format_reward_func": 2.146284818649292, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 609.6250305175781, | |
| "epoch": 0.08605220228384992, | |
| "grad_norm": 1.5426998553595659, | |
| "kl": 0.38818359375, | |
| "learning_rate": 2.2192853070679967e-07, | |
| "loss": 0.0004, | |
| "reward": 2.992326498031616, | |
| "reward_std": 0.6148561537265778, | |
| "rewards/equation_reward_func": 0.7916666865348816, | |
| "rewards/format_reward_func": 2.2006598711013794, | |
| "step": 422 | |
| }, | |
| { | |
| "completion_length": 603.0833587646484, | |
| "epoch": 0.0864600326264274, | |
| "grad_norm": 1.1676070955306959, | |
| "kl": 0.17626953125, | |
| "learning_rate": 2.1736492844166404e-07, | |
| "loss": 0.0002, | |
| "reward": 2.8410418033599854, | |
| "reward_std": 0.7288043797016144, | |
| "rewards/equation_reward_func": 0.75, | |
| "rewards/format_reward_func": 2.091041922569275, | |
| "step": 424 | |
| }, | |
| { | |
| "completion_length": 686.1041870117188, | |
| "epoch": 0.0868678629690049, | |
| "grad_norm": 1.2093809230932122, | |
| "kl": 0.16259765625, | |
| "learning_rate": 2.128356686585282e-07, | |
| "loss": 0.0002, | |
| "reward": 2.547639012336731, | |
| "reward_std": 1.0525963008403778, | |
| "rewards/equation_reward_func": 0.4583333544433117, | |
| "rewards/format_reward_func": 2.089305877685547, | |
| "step": 426 | |
| }, | |
| { | |
| "completion_length": 528.4166870117188, | |
| "epoch": 0.08727569331158239, | |
| "grad_norm": 1.493705154956843, | |
| "kl": 0.19921875, | |
| "learning_rate": 2.0834130169964692e-07, | |
| "loss": 0.0002, | |
| "reward": 3.80138897895813, | |
| "reward_std": 0.7043006718158722, | |
| "rewards/equation_reward_func": 1.6666667461395264, | |
| "rewards/format_reward_func": 2.1347222328186035, | |
| "step": 428 | |
| }, | |
| { | |
| "completion_length": 595.5416870117188, | |
| "epoch": 0.08768352365415986, | |
| "grad_norm": 1.0806302437363526, | |
| "kl": 0.162109375, | |
| "learning_rate": 2.0388237366751003e-07, | |
| "loss": 0.0002, | |
| "reward": 3.173958420753479, | |
| "reward_std": 1.080767273902893, | |
| "rewards/equation_reward_func": 1.125, | |
| "rewards/format_reward_func": 2.0489583611488342, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 461.81251525878906, | |
| "epoch": 0.08809135399673736, | |
| "grad_norm": 1.3305612303160375, | |
| "kl": 0.2109375, | |
| "learning_rate": 1.9945942635848745e-07, | |
| "loss": 0.0002, | |
| "reward": 3.8538542985916138, | |
| "reward_std": 0.5665659308433533, | |
| "rewards/equation_reward_func": 1.8333333730697632, | |
| "rewards/format_reward_func": 2.0205209255218506, | |
| "step": 432 | |
| }, | |
| { | |
| "completion_length": 655.4375, | |
| "epoch": 0.08849918433931485, | |
| "grad_norm": 1.5166587801432987, | |
| "kl": 0.23876953125, | |
| "learning_rate": 1.950729971969955e-07, | |
| "loss": 0.0002, | |
| "reward": 2.8268750309944153, | |
| "reward_std": 0.879076361656189, | |
| "rewards/equation_reward_func": 0.7916666865348816, | |
| "rewards/format_reward_func": 2.0352084040641785, | |
| "step": 434 | |
| }, | |
| { | |
| "completion_length": 768.7083435058594, | |
| "epoch": 0.08890701468189233, | |
| "grad_norm": 1.3585261445866479, | |
| "kl": 0.15380859375, | |
| "learning_rate": 1.9072361917019536e-07, | |
| "loss": 0.0002, | |
| "reward": 2.529687523841858, | |
| "reward_std": 0.5843232274055481, | |
| "rewards/equation_reward_func": 0.125, | |
| "rewards/format_reward_func": 2.4046876430511475, | |
| "step": 436 | |
| }, | |
| { | |
| "completion_length": 642.6458435058594, | |
| "epoch": 0.08931484502446982, | |
| "grad_norm": 1.2793508732682657, | |
| "kl": 0.18505859375, | |
| "learning_rate": 1.8641182076323148e-07, | |
| "loss": 0.0002, | |
| "reward": 3.2028820514678955, | |
| "reward_std": 0.9111791253089905, | |
| "rewards/equation_reward_func": 0.8750000298023224, | |
| "rewards/format_reward_func": 2.327882170677185, | |
| "step": 438 | |
| }, | |
| { | |
| "completion_length": 590.1041870117188, | |
| "epoch": 0.08972267536704731, | |
| "grad_norm": 1.243315094285279, | |
| "kl": 0.1865234375, | |
| "learning_rate": 1.8213812589501608e-07, | |
| "loss": 0.0002, | |
| "reward": 3.199236273765564, | |
| "reward_std": 0.7947587668895721, | |
| "rewards/equation_reward_func": 1.083333358168602, | |
| "rewards/format_reward_func": 2.115902900695801, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 667.2083435058594, | |
| "epoch": 0.0901305057096248, | |
| "grad_norm": 1.4563035634925794, | |
| "kl": 0.15966796875, | |
| "learning_rate": 1.7790305385456795e-07, | |
| "loss": 0.0002, | |
| "reward": 2.744722366333008, | |
| "reward_std": 0.8301202952861786, | |
| "rewards/equation_reward_func": 0.5416666865348816, | |
| "rewards/format_reward_func": 2.2030556201934814, | |
| "step": 442 | |
| }, | |
| { | |
| "completion_length": 628.7916870117188, | |
| "epoch": 0.09053833605220228, | |
| "grad_norm": 1.5549898799881567, | |
| "kl": 0.18359375, | |
| "learning_rate": 1.7370711923791564e-07, | |
| "loss": 0.0002, | |
| "reward": 2.978472352027893, | |
| "reward_std": 0.7657686173915863, | |
| "rewards/equation_reward_func": 0.875, | |
| "rewards/format_reward_func": 2.1034722328186035, | |
| "step": 444 | |
| }, | |
| { | |
| "completion_length": 589.9375152587891, | |
| "epoch": 0.09094616639477977, | |
| "grad_norm": 1.3961044134596396, | |
| "kl": 0.16455078125, | |
| "learning_rate": 1.6955083188556946e-07, | |
| "loss": 0.0002, | |
| "reward": 2.9413541555404663, | |
| "reward_std": 0.869944304227829, | |
| "rewards/equation_reward_func": 0.75, | |
| "rewards/format_reward_func": 2.191354274749756, | |
| "step": 446 | |
| }, | |
| { | |
| "completion_length": 588.5625305175781, | |
| "epoch": 0.09135399673735727, | |
| "grad_norm": 1.2868775692231729, | |
| "kl": 0.16845703125, | |
| "learning_rate": 1.6543469682057104e-07, | |
| "loss": 0.0002, | |
| "reward": 3.0225348472595215, | |
| "reward_std": 0.6649808585643768, | |
| "rewards/equation_reward_func": 0.9166666679084301, | |
| "rewards/format_reward_func": 2.105868101119995, | |
| "step": 448 | |
| }, | |
| { | |
| "completion_length": 619.4791717529297, | |
| "epoch": 0.09176182707993474, | |
| "grad_norm": 1.3540986287392276, | |
| "kl": 0.234375, | |
| "learning_rate": 1.6135921418712955e-07, | |
| "loss": 0.0002, | |
| "reward": 3.06413197517395, | |
| "reward_std": 0.5509577691555023, | |
| "rewards/equation_reward_func": 0.9166666865348816, | |
| "rewards/format_reward_func": 2.147465467453003, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 643.9166870117188, | |
| "epoch": 0.09216965742251224, | |
| "grad_norm": 1.3600242782766807, | |
| "kl": 0.169921875, | |
| "learning_rate": 1.5732487918985015e-07, | |
| "loss": 0.0002, | |
| "reward": 2.942257046699524, | |
| "reward_std": 0.6768557727336884, | |
| "rewards/equation_reward_func": 0.6666666865348816, | |
| "rewards/format_reward_func": 2.275590419769287, | |
| "step": 452 | |
| }, | |
| { | |
| "completion_length": 748.5625305175781, | |
| "epoch": 0.09257748776508973, | |
| "grad_norm": 1.0093428243067948, | |
| "kl": 0.15673828125, | |
| "learning_rate": 1.533321820335624e-07, | |
| "loss": 0.0002, | |
| "reward": 2.5672223567962646, | |
| "reward_std": 0.9604451656341553, | |
| "rewards/equation_reward_func": 0.3333333432674408, | |
| "rewards/format_reward_func": 2.233889102935791, | |
| "step": 454 | |
| }, | |
| { | |
| "completion_length": 631.9791870117188, | |
| "epoch": 0.0929853181076672, | |
| "grad_norm": 1.4428676303535133, | |
| "kl": 0.17236328125, | |
| "learning_rate": 1.493816078637557e-07, | |
| "loss": 0.0002, | |
| "reward": 3.1071181297302246, | |
| "reward_std": 0.5816036462783813, | |
| "rewards/equation_reward_func": 0.875, | |
| "rewards/format_reward_func": 2.232118010520935, | |
| "step": 456 | |
| }, | |
| { | |
| "completion_length": 745.5625, | |
| "epoch": 0.0933931484502447, | |
| "grad_norm": 0.9921529304927418, | |
| "kl": 0.16748046875, | |
| "learning_rate": 1.4547363670763136e-07, | |
| "loss": 0.0002, | |
| "reward": 2.374131917953491, | |
| "reward_std": 0.40968185663223267, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 2.374131917953491, | |
| "step": 458 | |
| }, | |
| { | |
| "completion_length": 705.0833740234375, | |
| "epoch": 0.09380097879282219, | |
| "grad_norm": 1.0713444392216345, | |
| "kl": 0.18310546875, | |
| "learning_rate": 1.4160874341577444e-07, | |
| "loss": 0.0002, | |
| "reward": 2.8997570276260376, | |
| "reward_std": 0.7406170666217804, | |
| "rewards/equation_reward_func": 0.5833333730697632, | |
| "rewards/format_reward_func": 2.3164236545562744, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 635.0000305175781, | |
| "epoch": 0.09420880913539967, | |
| "grad_norm": 1.3688348766889487, | |
| "kl": 0.18115234375, | |
| "learning_rate": 1.3778739760445552e-07, | |
| "loss": 0.0002, | |
| "reward": 2.9428473711013794, | |
| "reward_std": 0.6385838389396667, | |
| "rewards/equation_reward_func": 0.7083333730697632, | |
| "rewards/format_reward_func": 2.234513998031616, | |
| "step": 462 | |
| }, | |
| { | |
| "completion_length": 637.1458435058594, | |
| "epoch": 0.09461663947797716, | |
| "grad_norm": 1.07823940444348, | |
| "kl": 0.1787109375, | |
| "learning_rate": 1.3401006359856916e-07, | |
| "loss": 0.0002, | |
| "reward": 3.116041660308838, | |
| "reward_std": 0.6116780638694763, | |
| "rewards/equation_reward_func": 0.9166666865348816, | |
| "rewards/format_reward_func": 2.1993749141693115, | |
| "step": 464 | |
| }, | |
| { | |
| "completion_length": 671.1041870117188, | |
| "epoch": 0.09502446982055465, | |
| "grad_norm": 1.0251069650416453, | |
| "kl": 0.1611328125, | |
| "learning_rate": 1.3027720037521395e-07, | |
| "loss": 0.0002, | |
| "reward": 3.0604861974716187, | |
| "reward_std": 0.6963988989591599, | |
| "rewards/equation_reward_func": 0.9583333730697632, | |
| "rewards/format_reward_func": 2.1021528244018555, | |
| "step": 466 | |
| }, | |
| { | |
| "completion_length": 508.33335876464844, | |
| "epoch": 0.09543230016313213, | |
| "grad_norm": 1.2041214798556659, | |
| "kl": 0.1513671875, | |
| "learning_rate": 1.2658926150792322e-07, | |
| "loss": 0.0002, | |
| "reward": 3.716770887374878, | |
| "reward_std": 0.8307555913925171, | |
| "rewards/equation_reward_func": 1.6666666865348816, | |
| "rewards/format_reward_func": 2.050104081630707, | |
| "step": 468 | |
| }, | |
| { | |
| "completion_length": 702.5625, | |
| "epoch": 0.09584013050570962, | |
| "grad_norm": 1.4167156857768421, | |
| "kl": 0.171875, | |
| "learning_rate": 1.229466951115519e-07, | |
| "loss": 0.0002, | |
| "reward": 2.8170487880706787, | |
| "reward_std": 0.932531863451004, | |
| "rewards/equation_reward_func": 0.6666666865348816, | |
| "rewards/format_reward_func": 2.1503820419311523, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 440.3958435058594, | |
| "epoch": 0.09624796084828711, | |
| "grad_norm": 1.2804118870894239, | |
| "kl": 0.205078125, | |
| "learning_rate": 1.193499437878277e-07, | |
| "loss": 0.0002, | |
| "reward": 3.693055510520935, | |
| "reward_std": 0.6622753441333771, | |
| "rewards/equation_reward_func": 1.7083333730697632, | |
| "rewards/format_reward_func": 1.9847222566604614, | |
| "step": 472 | |
| }, | |
| { | |
| "completion_length": 654.7291870117188, | |
| "epoch": 0.0966557911908646, | |
| "grad_norm": 1.4440438187505384, | |
| "kl": 0.20458984375, | |
| "learning_rate": 1.1579944457157059e-07, | |
| "loss": 0.0002, | |
| "reward": 2.8649654388427734, | |
| "reward_std": 0.7565539479255676, | |
| "rewards/equation_reward_func": 0.6666666865348816, | |
| "rewards/format_reward_func": 2.198298692703247, | |
| "step": 474 | |
| }, | |
| { | |
| "completion_length": 618.1250305175781, | |
| "epoch": 0.09706362153344208, | |
| "grad_norm": 1.2861995217349655, | |
| "kl": 0.16748046875, | |
| "learning_rate": 1.1229562887758925e-07, | |
| "loss": 0.0002, | |
| "reward": 2.932604193687439, | |
| "reward_std": 0.7926245033740997, | |
| "rewards/equation_reward_func": 0.7916666679084301, | |
| "rewards/format_reward_func": 2.1409374475479126, | |
| "step": 476 | |
| }, | |
| { | |
| "completion_length": 566.7083435058594, | |
| "epoch": 0.09747145187601958, | |
| "grad_norm": 1.2073113401945212, | |
| "kl": 0.1943359375, | |
| "learning_rate": 1.088389224482617e-07, | |
| "loss": 0.0002, | |
| "reward": 3.6285417079925537, | |
| "reward_std": 0.7963749468326569, | |
| "rewards/equation_reward_func": 1.4583333730697632, | |
| "rewards/format_reward_func": 2.170208215713501, | |
| "step": 478 | |
| }, | |
| { | |
| "completion_length": 616.9791870117188, | |
| "epoch": 0.09787928221859707, | |
| "grad_norm": 3.5720337342156494, | |
| "kl": 0.1904296875, | |
| "learning_rate": 1.0542974530180327e-07, | |
| "loss": 0.0002, | |
| "reward": 3.047569513320923, | |
| "reward_std": 0.9450699985027313, | |
| "rewards/equation_reward_func": 0.833333358168602, | |
| "rewards/format_reward_func": 2.2142361402511597, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 656.5833435058594, | |
| "epoch": 0.09828711256117455, | |
| "grad_norm": 1.3363621901974427, | |
| "kl": 0.1962890625, | |
| "learning_rate": 1.0206851168123076e-07, | |
| "loss": 0.0002, | |
| "reward": 2.9283682107925415, | |
| "reward_std": 0.7495492100715637, | |
| "rewards/equation_reward_func": 0.7083333730697632, | |
| "rewards/format_reward_func": 2.2200348377227783, | |
| "step": 482 | |
| }, | |
| { | |
| "completion_length": 584.3125152587891, | |
| "epoch": 0.09869494290375204, | |
| "grad_norm": 1.235985667420378, | |
| "kl": 0.18017578125, | |
| "learning_rate": 9.875563000402948e-08, | |
| "loss": 0.0002, | |
| "reward": 3.0726042985916138, | |
| "reward_std": 0.9599539935588837, | |
| "rewards/equation_reward_func": 0.958333358168602, | |
| "rewards/format_reward_func": 2.1142709255218506, | |
| "step": 484 | |
| }, | |
| { | |
| "completion_length": 602.3958435058594, | |
| "epoch": 0.09910277324632953, | |
| "grad_norm": 1.352419507127413, | |
| "kl": 0.18310546875, | |
| "learning_rate": 9.549150281252632e-08, | |
| "loss": 0.0002, | |
| "reward": 3.627708315849304, | |
| "reward_std": 0.8821892440319061, | |
| "rewards/equation_reward_func": 1.4166666865348816, | |
| "rewards/format_reward_func": 2.211041808128357, | |
| "step": 486 | |
| }, | |
| { | |
| "completion_length": 684.2916870117188, | |
| "epoch": 0.09951060358890701, | |
| "grad_norm": 1.406310592245728, | |
| "kl": 0.181640625, | |
| "learning_rate": 9.22765267249776e-08, | |
| "loss": 0.0002, | |
| "reward": 2.766076445579529, | |
| "reward_std": 0.6145432703197002, | |
| "rewards/equation_reward_func": 0.4583333432674408, | |
| "rewards/format_reward_func": 2.3077430725097656, | |
| "step": 488 | |
| }, | |
| { | |
| "completion_length": 657.7916870117188, | |
| "epoch": 0.0999184339314845, | |
| "grad_norm": 1.3318403971159751, | |
| "kl": 0.2080078125, | |
| "learning_rate": 8.911109238737747e-08, | |
| "loss": 0.0002, | |
| "reward": 3.1873958110809326, | |
| "reward_std": 0.37190073914825916, | |
| "rewards/equation_reward_func": 0.8333333730697632, | |
| "rewards/format_reward_func": 2.354062557220459, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 756.6041870117188, | |
| "epoch": 0.100326264274062, | |
| "grad_norm": 1.4061978494714438, | |
| "kl": 0.20703125, | |
| "learning_rate": 8.599558442598998e-08, | |
| "loss": 0.0002, | |
| "reward": 2.171909749507904, | |
| "reward_std": 0.6123473569750786, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 2.171909749507904, | |
| "step": 492 | |
| }, | |
| { | |
| "completion_length": 590.2916717529297, | |
| "epoch": 0.10073409461663947, | |
| "grad_norm": 1.17520751465437, | |
| "kl": 0.19677734375, | |
| "learning_rate": 8.293038140061515e-08, | |
| "loss": 0.0002, | |
| "reward": 3.068298816680908, | |
| "reward_std": 0.5875828564167023, | |
| "rewards/equation_reward_func": 0.9166666865348816, | |
| "rewards/format_reward_func": 2.151632070541382, | |
| "step": 494 | |
| }, | |
| { | |
| "completion_length": 689.9166870117188, | |
| "epoch": 0.10114192495921696, | |
| "grad_norm": 1.329822562589213, | |
| "kl": 0.19287109375, | |
| "learning_rate": 7.991585575858961e-08, | |
| "loss": 0.0002, | |
| "reward": 2.7232291102409363, | |
| "reward_std": 0.9429112374782562, | |
| "rewards/equation_reward_func": 0.625, | |
| "rewards/format_reward_func": 2.0982291102409363, | |
| "step": 496 | |
| }, | |
| { | |
| "completion_length": 644.4791717529297, | |
| "epoch": 0.10154975530179446, | |
| "grad_norm": 1.7033208401953535, | |
| "kl": 0.17529296875, | |
| "learning_rate": 7.695237378953224e-08, | |
| "loss": 0.0002, | |
| "reward": 2.9771876335144043, | |
| "reward_std": 0.73157799243927, | |
| "rewards/equation_reward_func": 0.7916666865348816, | |
| "rewards/format_reward_func": 2.1855210065841675, | |
| "step": 498 | |
| }, | |
| { | |
| "completion_length": 737.6041870117188, | |
| "epoch": 0.10195758564437195, | |
| "grad_norm": 1.713508657462858, | |
| "kl": 0.18017578125, | |
| "learning_rate": 7.404029558083652e-08, | |
| "loss": 0.0002, | |
| "reward": 2.4721529483795166, | |
| "reward_std": 0.4938492923974991, | |
| "rewards/equation_reward_func": 0.0833333358168602, | |
| "rewards/format_reward_func": 2.388819456100464, | |
| "step": 500 | |
| }, | |
| { | |
| "completion_length": 665.3333740234375, | |
| "epoch": 0.10236541598694943, | |
| "grad_norm": 1.0340163471089046, | |
| "kl": 0.18701171875, | |
| "learning_rate": 7.117997497391648e-08, | |
| "loss": 0.0002, | |
| "reward": 2.9871530532836914, | |
| "reward_std": 0.7464583814144135, | |
| "rewards/equation_reward_func": 0.7916666865348816, | |
| "rewards/format_reward_func": 2.195486307144165, | |
| "step": 502 | |
| }, | |
| { | |
| "completion_length": 661.5625305175781, | |
| "epoch": 0.10277324632952692, | |
| "grad_norm": 1.0282672590274708, | |
| "kl": 0.17529296875, | |
| "learning_rate": 6.837175952121304e-08, | |
| "loss": 0.0002, | |
| "reward": 2.9892709255218506, | |
| "reward_std": 0.6545587778091431, | |
| "rewards/equation_reward_func": 0.7083333730697632, | |
| "rewards/format_reward_func": 2.2809375524520874, | |
| "step": 504 | |
| }, | |
| { | |
| "completion_length": 788.7500305175781, | |
| "epoch": 0.10318107667210441, | |
| "grad_norm": 1.2155250881363209, | |
| "kl": 0.1943359375, | |
| "learning_rate": 6.561599044396288e-08, | |
| "loss": 0.0002, | |
| "reward": 2.4754514694213867, | |
| "reward_std": 1.2856568098068237, | |
| "rewards/equation_reward_func": 0.5, | |
| "rewards/format_reward_func": 1.9754514694213867, | |
| "step": 506 | |
| }, | |
| { | |
| "completion_length": 689.5833435058594, | |
| "epoch": 0.10358890701468189, | |
| "grad_norm": 1.1923299231333921, | |
| "kl": 0.19775390625, | |
| "learning_rate": 6.291300259073722e-08, | |
| "loss": 0.0002, | |
| "reward": 2.958611249923706, | |
| "reward_std": 0.8271161913871765, | |
| "rewards/equation_reward_func": 0.8333333730697632, | |
| "rewards/format_reward_func": 2.125277876853943, | |
| "step": 508 | |
| }, | |
| { | |
| "completion_length": 637.3958435058594, | |
| "epoch": 0.10399673735725938, | |
| "grad_norm": 1.594736749913249, | |
| "kl": 0.2021484375, | |
| "learning_rate": 6.026312439675551e-08, | |
| "loss": 0.0002, | |
| "reward": 2.7334723472595215, | |
| "reward_std": 0.9248130321502686, | |
| "rewards/equation_reward_func": 0.625, | |
| "rewards/format_reward_func": 2.1084723472595215, | |
| "step": 510 | |
| }, | |
| { | |
| "completion_length": 648.875, | |
| "epoch": 0.10440456769983687, | |
| "grad_norm": 1.1533792615058107, | |
| "kl": 0.18505859375, | |
| "learning_rate": 5.7666677843977053e-08, | |
| "loss": 0.0002, | |
| "reward": 2.86263906955719, | |
| "reward_std": 0.8505788147449493, | |
| "rewards/equation_reward_func": 0.75, | |
| "rewards/format_reward_func": 2.1126389503479004, | |
| "step": 512 | |
| }, | |
| { | |
| "completion_length": 619.9166870117188, | |
| "epoch": 0.10481239804241435, | |
| "grad_norm": 1.1528844064024761, | |
| "kl": 0.18896484375, | |
| "learning_rate": 5.5123978421978464e-08, | |
| "loss": 0.0002, | |
| "reward": 3.175590395927429, | |
| "reward_std": 0.8909508585929871, | |
| "rewards/equation_reward_func": 1.1250000298023224, | |
| "rewards/format_reward_func": 2.050590455532074, | |
| "step": 514 | |
| }, | |
| { | |
| "completion_length": 652.8541870117188, | |
| "epoch": 0.10522022838499184, | |
| "grad_norm": 1.026362576623631, | |
| "kl": 0.18408203125, | |
| "learning_rate": 5.263533508961826e-08, | |
| "loss": 0.0002, | |
| "reward": 2.9189236164093018, | |
| "reward_std": 0.7488152384757996, | |
| "rewards/equation_reward_func": 0.7083333730697632, | |
| "rewards/format_reward_func": 2.210590362548828, | |
| "step": 516 | |
| }, | |
| { | |
| "completion_length": 659.4583435058594, | |
| "epoch": 0.10562805872756934, | |
| "grad_norm": 1.3341872745914232, | |
| "kl": 0.185546875, | |
| "learning_rate": 5.0201050237496435e-08, | |
| "loss": 0.0002, | |
| "reward": 2.915416717529297, | |
| "reward_std": 0.785500556230545, | |
| "rewards/equation_reward_func": 0.7916666865348816, | |
| "rewards/format_reward_func": 2.12375009059906, | |
| "step": 518 | |
| }, | |
| { | |
| "completion_length": 724.3333435058594, | |
| "epoch": 0.10603588907014681, | |
| "grad_norm": 1.1389027986321225, | |
| "kl": 0.16650390625, | |
| "learning_rate": 4.7821419651211284e-08, | |
| "loss": 0.0002, | |
| "reward": 2.604514002799988, | |
| "reward_std": 0.9652212858200073, | |
| "rewards/equation_reward_func": 0.4166666865348816, | |
| "rewards/format_reward_func": 2.187847375869751, | |
| "step": 520 | |
| }, | |
| { | |
| "completion_length": 616.1666717529297, | |
| "epoch": 0.1064437194127243, | |
| "grad_norm": 1.6950481305726997, | |
| "kl": 0.17822265625, | |
| "learning_rate": 4.549673247541874e-08, | |
| "loss": 0.0002, | |
| "reward": 2.983611226081848, | |
| "reward_std": 0.5816805064678192, | |
| "rewards/equation_reward_func": 0.75, | |
| "rewards/format_reward_func": 2.233611226081848, | |
| "step": 522 | |
| }, | |
| { | |
| "completion_length": 786.8541870117188, | |
| "epoch": 0.1068515497553018, | |
| "grad_norm": 0.9868783610320885, | |
| "kl": 0.18701171875, | |
| "learning_rate": 4.322727117869951e-08, | |
| "loss": 0.0002, | |
| "reward": 2.284409761428833, | |
| "reward_std": 0.6448712944984436, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 2.284409761428833, | |
| "step": 524 | |
| }, | |
| { | |
| "completion_length": 675.7291870117188, | |
| "epoch": 0.10725938009787928, | |
| "grad_norm": 1.5648006375954522, | |
| "kl": 0.1806640625, | |
| "learning_rate": 4.1013311519236485e-08, | |
| "loss": 0.0002, | |
| "reward": 2.776354193687439, | |
| "reward_std": 0.9456824660301208, | |
| "rewards/equation_reward_func": 0.7083333730697632, | |
| "rewards/format_reward_func": 2.0680209398269653, | |
| "step": 526 | |
| }, | |
| { | |
| "completion_length": 667.4375305175781, | |
| "epoch": 0.10766721044045677, | |
| "grad_norm": 1.53346953576534, | |
| "kl": 0.1943359375, | |
| "learning_rate": 3.8855122511307626e-08, | |
| "loss": 0.0002, | |
| "reward": 2.9528820514678955, | |
| "reward_std": 0.8039775192737579, | |
| "rewards/equation_reward_func": 0.7500000409781933, | |
| "rewards/format_reward_func": 2.2028820514678955, | |
| "step": 528 | |
| }, | |
| { | |
| "completion_length": 630.2083740234375, | |
| "epoch": 0.10807504078303426, | |
| "grad_norm": 1.5015454500255272, | |
| "kl": 0.18505859375, | |
| "learning_rate": 3.6752966392599117e-08, | |
| "loss": 0.0002, | |
| "reward": 2.824340343475342, | |
| "reward_std": 0.8222399055957794, | |
| "rewards/equation_reward_func": 0.5833333432674408, | |
| "rewards/format_reward_func": 2.2410069704055786, | |
| "step": 530 | |
| }, | |
| { | |
| "completion_length": 794.3541870117188, | |
| "epoch": 0.10848287112561175, | |
| "grad_norm": 1.0374860320295616, | |
| "kl": 0.17626953125, | |
| "learning_rate": 3.470709859234083e-08, | |
| "loss": 0.0002, | |
| "reward": 2.1905903816223145, | |
| "reward_std": 0.7955919802188873, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 2.1905903816223145, | |
| "step": 532 | |
| }, | |
| { | |
| "completion_length": 774.3541870117188, | |
| "epoch": 0.10889070146818923, | |
| "grad_norm": 1.120736957546931, | |
| "kl": 0.2001953125, | |
| "learning_rate": 3.271776770026963e-08, | |
| "loss": 0.0002, | |
| "reward": 2.2923611402511597, | |
| "reward_std": 0.7566681504249573, | |
| "rewards/equation_reward_func": 0.0416666679084301, | |
| "rewards/format_reward_func": 2.250694513320923, | |
| "step": 534 | |
| }, | |
| { | |
| "completion_length": 631.375, | |
| "epoch": 0.10929853181076672, | |
| "grad_norm": 1.1960849785247343, | |
| "kl": 0.20947265625, | |
| "learning_rate": 3.0785215436423985e-08, | |
| "loss": 0.0002, | |
| "reward": 3.0184723138809204, | |
| "reward_std": 0.9326076507568359, | |
| "rewards/equation_reward_func": 1.0, | |
| "rewards/format_reward_func": 2.0184723138809204, | |
| "step": 536 | |
| }, | |
| { | |
| "completion_length": 697.6041870117188, | |
| "epoch": 0.10970636215334421, | |
| "grad_norm": 1.3324134907829273, | |
| "kl": 0.19091796875, | |
| "learning_rate": 2.8909676621772848e-08, | |
| "loss": 0.0002, | |
| "reward": 2.5407986640930176, | |
| "reward_std": 0.8522857427597046, | |
| "rewards/equation_reward_func": 0.3333333432674408, | |
| "rewards/format_reward_func": 2.2074652910232544, | |
| "step": 538 | |
| }, | |
| { | |
| "completion_length": 661.75, | |
| "epoch": 0.11011419249592169, | |
| "grad_norm": 1.4930494245585781, | |
| "kl": 0.18701171875, | |
| "learning_rate": 2.7091379149682682e-08, | |
| "loss": 0.0002, | |
| "reward": 2.877777934074402, | |
| "reward_std": 0.41776843182742596, | |
| "rewards/equation_reward_func": 0.5416666865348816, | |
| "rewards/format_reward_func": 2.3361111879348755, | |
| "step": 540 | |
| }, | |
| { | |
| "completion_length": 503.8958435058594, | |
| "epoch": 0.11052202283849918, | |
| "grad_norm": 1.4985848888652291, | |
| "kl": 0.310546875, | |
| "learning_rate": 2.5330543958227035e-08, | |
| "loss": 0.0003, | |
| "reward": 3.7885764837265015, | |
| "reward_std": 0.5873951315879822, | |
| "rewards/equation_reward_func": 1.7083333730697632, | |
| "rewards/format_reward_func": 2.0802430510520935, | |
| "step": 542 | |
| }, | |
| { | |
| "completion_length": 812.0416870117188, | |
| "epoch": 0.11092985318107668, | |
| "grad_norm": 1.0755426609459213, | |
| "kl": 0.16943359375, | |
| "learning_rate": 2.362738500334055e-08, | |
| "loss": 0.0002, | |
| "reward": 2.2252084016799927, | |
| "reward_std": 0.7724728882312775, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 2.2252084016799927, | |
| "step": 544 | |
| }, | |
| { | |
| "completion_length": 589.3541717529297, | |
| "epoch": 0.11133768352365415, | |
| "grad_norm": 1.2868495738115369, | |
| "kl": 0.154296875, | |
| "learning_rate": 2.1982109232821176e-08, | |
| "loss": 0.0002, | |
| "reward": 3.426076292991638, | |
| "reward_std": 0.8766676485538483, | |
| "rewards/equation_reward_func": 1.2083333432674408, | |
| "rewards/format_reward_func": 2.217743158340454, | |
| "step": 546 | |
| }, | |
| { | |
| "completion_length": 697.1041870117188, | |
| "epoch": 0.11174551386623165, | |
| "grad_norm": 1.5843230276459015, | |
| "kl": 0.17822265625, | |
| "learning_rate": 2.0394916561185084e-08, | |
| "loss": 0.0002, | |
| "reward": 2.429861068725586, | |
| "reward_std": 0.6437118351459503, | |
| "rewards/equation_reward_func": 0.1666666716337204, | |
| "rewards/format_reward_func": 2.263194441795349, | |
| "step": 548 | |
| }, | |
| { | |
| "completion_length": 620.0416870117188, | |
| "epoch": 0.11215334420880914, | |
| "grad_norm": 1.43390485144146, | |
| "kl": 0.2021484375, | |
| "learning_rate": 1.8865999845374792e-08, | |
| "loss": 0.0002, | |
| "reward": 2.9577430486679077, | |
| "reward_std": 0.7144142985343933, | |
| "rewards/equation_reward_func": 0.9166666865348816, | |
| "rewards/format_reward_func": 2.0410765409469604, | |
| "step": 550 | |
| }, | |
| { | |
| "completion_length": 591.4375, | |
| "epoch": 0.11256117455138662, | |
| "grad_norm": 1.772375367915741, | |
| "kl": 0.26416015625, | |
| "learning_rate": 1.7395544861325718e-08, | |
| "loss": 0.0003, | |
| "reward": 3.147847294807434, | |
| "reward_std": 0.8900530934333801, | |
| "rewards/equation_reward_func": 0.958333358168602, | |
| "rewards/format_reward_func": 2.189513921737671, | |
| "step": 552 | |
| }, | |
| { | |
| "completion_length": 644.6666717529297, | |
| "epoch": 0.11296900489396411, | |
| "grad_norm": 1.0011090616007174, | |
| "kl": 0.15673828125, | |
| "learning_rate": 1.598373028139266e-08, | |
| "loss": 0.0002, | |
| "reward": 2.8859028816223145, | |
| "reward_std": 0.8658215999603271, | |
| "rewards/equation_reward_func": 0.75, | |
| "rewards/format_reward_func": 2.1359028220176697, | |
| "step": 554 | |
| }, | |
| { | |
| "completion_length": 802.8541870117188, | |
| "epoch": 0.1133768352365416, | |
| "grad_norm": 1.0859783026177827, | |
| "kl": 0.1611328125, | |
| "learning_rate": 1.4630727652640007e-08, | |
| "loss": 0.0002, | |
| "reward": 2.3715277910232544, | |
| "reward_std": 0.9544045031070709, | |
| "rewards/equation_reward_func": 0.1666666716337204, | |
| "rewards/format_reward_func": 2.204861283302307, | |
| "step": 556 | |
| }, | |
| { | |
| "completion_length": 653.8958435058594, | |
| "epoch": 0.1137846655791191, | |
| "grad_norm": 1.324673649956278, | |
| "kl": 0.189453125, | |
| "learning_rate": 1.3336701375997127e-08, | |
| "loss": 0.0002, | |
| "reward": 3.0635764598846436, | |
| "reward_std": 0.6987862586975098, | |
| "rewards/equation_reward_func": 0.75, | |
| "rewards/format_reward_func": 2.313576579093933, | |
| "step": 558 | |
| }, | |
| { | |
| "completion_length": 615.7708435058594, | |
| "epoch": 0.11419249592169657, | |
| "grad_norm": 1.9720955384622545, | |
| "kl": 0.1845703125, | |
| "learning_rate": 1.2101808686282189e-08, | |
| "loss": 0.0002, | |
| "reward": 2.958263874053955, | |
| "reward_std": 0.5063729882240295, | |
| "rewards/equation_reward_func": 0.7083333730697632, | |
| "rewards/format_reward_func": 2.2499306201934814, | |
| "step": 560 | |
| }, | |
| { | |
| "completion_length": 644.4166870117188, | |
| "epoch": 0.11460032626427406, | |
| "grad_norm": 0.9932246904810618, | |
| "kl": 0.1787109375, | |
| "learning_rate": 1.0926199633097154e-08, | |
| "loss": 0.0002, | |
| "reward": 2.9698264598846436, | |
| "reward_std": 0.7344387173652649, | |
| "rewards/equation_reward_func": 0.7916666865348816, | |
| "rewards/format_reward_func": 2.1781598329544067, | |
| "step": 562 | |
| }, | |
| { | |
| "completion_length": 647.9166870117188, | |
| "epoch": 0.11500815660685156, | |
| "grad_norm": 1.3817710664807803, | |
| "kl": 0.1806640625, | |
| "learning_rate": 9.810017062595321e-09, | |
| "loss": 0.0002, | |
| "reward": 2.9287848472595215, | |
| "reward_std": 0.8746606707572937, | |
| "rewards/equation_reward_func": 0.75, | |
| "rewards/format_reward_func": 2.1787847876548767, | |
| "step": 564 | |
| }, | |
| { | |
| "completion_length": 471.43751525878906, | |
| "epoch": 0.11541598694942903, | |
| "grad_norm": 1.3352359323731087, | |
| "kl": 0.19482421875, | |
| "learning_rate": 8.753396600124252e-09, | |
| "loss": 0.0002, | |
| "reward": 3.6786112785339355, | |
| "reward_std": 0.8317141830921173, | |
| "rewards/equation_reward_func": 1.7083333730697632, | |
| "rewards/format_reward_func": 1.9702778458595276, | |
| "step": 566 | |
| }, | |
| { | |
| "completion_length": 744.6041870117188, | |
| "epoch": 0.11582381729200653, | |
| "grad_norm": 1.4277278138028093, | |
| "kl": 0.1787109375, | |
| "learning_rate": 7.756466633746406e-09, | |
| "loss": 0.0002, | |
| "reward": 2.5164932012557983, | |
| "reward_std": 0.5315538048744202, | |
| "rewards/equation_reward_func": 0.125, | |
| "rewards/format_reward_func": 2.391493082046509, | |
| "step": 568 | |
| }, | |
| { | |
| "completion_length": 658.3333435058594, | |
| "epoch": 0.11623164763458402, | |
| "grad_norm": 1.1591152772878843, | |
| "kl": 0.189453125, | |
| "learning_rate": 6.819348298638839e-09, | |
| "loss": 0.0002, | |
| "reward": 2.8295485973358154, | |
| "reward_std": 0.7308537364006042, | |
| "rewards/equation_reward_func": 0.5833333730697632, | |
| "rewards/format_reward_func": 2.246215343475342, | |
| "step": 570 | |
| }, | |
| { | |
| "completion_length": 578.1875305175781, | |
| "epoch": 0.1166394779771615, | |
| "grad_norm": 1.4699911779642927, | |
| "kl": 0.21044921875, | |
| "learning_rate": 5.942155462374199e-09, | |
| "loss": 0.0002, | |
| "reward": 3.107847213745117, | |
| "reward_std": 0.4525897800922394, | |
| "rewards/equation_reward_func": 0.9583333730697632, | |
| "rewards/format_reward_func": 2.149513900279999, | |
| "step": 572 | |
| }, | |
| { | |
| "completion_length": 575.4375305175781, | |
| "epoch": 0.11704730831973899, | |
| "grad_norm": 1.4198721859673429, | |
| "kl": 0.19775390625, | |
| "learning_rate": 5.1249947110849626e-09, | |
| "loss": 0.0002, | |
| "reward": 3.4872570037841797, | |
| "reward_std": 0.8599075376987457, | |
| "rewards/equation_reward_func": 1.2916666865348816, | |
| "rewards/format_reward_func": 2.195590376853943, | |
| "step": 574 | |
| }, | |
| { | |
| "completion_length": 705.6041870117188, | |
| "epoch": 0.11745513866231648, | |
| "grad_norm": 1.4989903624292338, | |
| "kl": 0.1923828125, | |
| "learning_rate": 4.367965336512403e-09, | |
| "loss": 0.0002, | |
| "reward": 2.398923635482788, | |
| "reward_std": 0.3067256808280945, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 2.398923873901367, | |
| "step": 576 | |
| }, | |
| { | |
| "completion_length": 755.6875305175781, | |
| "epoch": 0.11786296900489396, | |
| "grad_norm": 1.1452197538999578, | |
| "kl": 0.1962890625, | |
| "learning_rate": 3.671159323941797e-09, | |
| "loss": 0.0002, | |
| "reward": 2.24670147895813, | |
| "reward_std": 0.6733859181404114, | |
| "rewards/equation_reward_func": 0.0, | |
| "rewards/format_reward_func": 2.24670147895813, | |
| "step": 578 | |
| }, | |
| { | |
| "completion_length": 664.6666870117188, | |
| "epoch": 0.11827079934747145, | |
| "grad_norm": 1.2449058428646427, | |
| "kl": 0.197265625, | |
| "learning_rate": 3.0346613410252574e-09, | |
| "loss": 0.0002, | |
| "reward": 3.29194438457489, | |
| "reward_std": 0.9043855667114258, | |
| "rewards/equation_reward_func": 1.0, | |
| "rewards/format_reward_func": 2.29194438457489, | |
| "step": 580 | |
| }, | |
| { | |
| "completion_length": 617.7291717529297, | |
| "epoch": 0.11867862969004894, | |
| "grad_norm": 1.6256483463504106, | |
| "kl": 0.23583984375, | |
| "learning_rate": 2.458548727494292e-09, | |
| "loss": 0.0002, | |
| "reward": 3.077187657356262, | |
| "reward_std": 0.5232462398707867, | |
| "rewards/equation_reward_func": 0.875, | |
| "rewards/format_reward_func": 2.2021875381469727, | |
| "step": 582 | |
| }, | |
| { | |
| "completion_length": 562.9791870117188, | |
| "epoch": 0.11908646003262642, | |
| "grad_norm": 1.3493720634814992, | |
| "kl": 0.20654296875, | |
| "learning_rate": 1.942891485762044e-09, | |
| "loss": 0.0002, | |
| "reward": 3.354375123977661, | |
| "reward_std": 0.5960628092288971, | |
| "rewards/equation_reward_func": 1.1666666865348816, | |
| "rewards/format_reward_func": 2.1877083778381348, | |
| "step": 584 | |
| }, | |
| { | |
| "completion_length": 643.0625305175781, | |
| "epoch": 0.11949429037520391, | |
| "grad_norm": 1.4429671637976917, | |
| "kl": 0.1787109375, | |
| "learning_rate": 1.4877522724175972e-09, | |
| "loss": 0.0002, | |
| "reward": 2.989027738571167, | |
| "reward_std": 1.0448077917099, | |
| "rewards/equation_reward_func": 0.8333333730697632, | |
| "rewards/format_reward_func": 2.1556944847106934, | |
| "step": 586 | |
| }, | |
| { | |
| "completion_length": 545.5208435058594, | |
| "epoch": 0.1199021207177814, | |
| "grad_norm": 1.3733935240774355, | |
| "kl": 0.18994140625, | |
| "learning_rate": 1.0931863906127325e-09, | |
| "loss": 0.0002, | |
| "reward": 3.299618124961853, | |
| "reward_std": 1.0864940881729126, | |
| "rewards/equation_reward_func": 1.1250000596046448, | |
| "rewards/format_reward_func": 2.174618124961853, | |
| "step": 588 | |
| }, | |
| { | |
| "completion_length": 597.1666870117188, | |
| "epoch": 0.1203099510603589, | |
| "grad_norm": 1.2952890180323302, | |
| "kl": 0.20654296875, | |
| "learning_rate": 7.592417833419129e-10, | |
| "loss": 0.0002, | |
| "reward": 3.2572569847106934, | |
| "reward_std": 0.8164662718772888, | |
| "rewards/equation_reward_func": 1.0833333730697632, | |
| "rewards/format_reward_func": 2.1739237308502197, | |
| "step": 590 | |
| }, | |
| { | |
| "completion_length": 557.7083435058594, | |
| "epoch": 0.12071778140293637, | |
| "grad_norm": 1.419396729242162, | |
| "kl": 0.18310546875, | |
| "learning_rate": 4.859590276170556e-10, | |
| "loss": 0.0002, | |
| "reward": 3.1976042985916138, | |
| "reward_std": 0.1676994524896145, | |
| "rewards/equation_reward_func": 1.0, | |
| "rewards/format_reward_func": 2.197604179382324, | |
| "step": 592 | |
| }, | |
| { | |
| "completion_length": 640.6666870117188, | |
| "epoch": 0.12112561174551387, | |
| "grad_norm": 1.7175944027716497, | |
| "kl": 0.21728515625, | |
| "learning_rate": 2.733713295369755e-10, | |
| "loss": 0.0002, | |
| "reward": 2.8080209493637085, | |
| "reward_std": 0.7391078174114227, | |
| "rewards/equation_reward_func": 0.5416666865348816, | |
| "rewards/format_reward_func": 2.266354203224182, | |
| "step": 594 | |
| }, | |
| { | |
| "completion_length": 658.6458435058594, | |
| "epoch": 0.12153344208809136, | |
| "grad_norm": 1.1130542587727132, | |
| "kl": 0.17919921875, | |
| "learning_rate": 1.215045202527243e-10, | |
| "loss": 0.0002, | |
| "reward": 2.9176390171051025, | |
| "reward_std": 0.8149993717670441, | |
| "rewards/equation_reward_func": 0.6666666865348816, | |
| "rewards/format_reward_func": 2.250972270965576, | |
| "step": 596 | |
| }, | |
| { | |
| "completion_length": 666.5416870117188, | |
| "epoch": 0.12194127243066884, | |
| "grad_norm": 1.3351873357408293, | |
| "kl": 0.1982421875, | |
| "learning_rate": 3.037705282848968e-11, | |
| "loss": 0.0002, | |
| "reward": 2.8039932250976562, | |
| "reward_std": 0.471679862588644, | |
| "rewards/equation_reward_func": 0.4583333432674408, | |
| "rewards/format_reward_func": 2.345659852027893, | |
| "step": 598 | |
| }, | |
| { | |
| "completion_length": 642.3125305175781, | |
| "epoch": 0.12234910277324633, | |
| "grad_norm": 1.3356459953636426, | |
| "kl": 0.21044921875, | |
| "learning_rate": 0.0, | |
| "loss": 0.0002, | |
| "reward": 2.9183679819107056, | |
| "reward_std": 0.771289050579071, | |
| "rewards/equation_reward_func": 0.75, | |
| "rewards/format_reward_func": 2.168368101119995, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.12234910277324633, | |
| "step": 600, | |
| "total_flos": 0.0, | |
| "train_loss": 0.00014642298419068685, | |
| "train_runtime": 10716.9446, | |
| "train_samples_per_second": 1.344, | |
| "train_steps_per_second": 0.056 | |
| } | |
| ], | |
| "logging_steps": 2, | |
| "max_steps": 600, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |