{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1376367765466933, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 345.28125, "epoch": 6.881838827334664e-05, "grad_norm": 1.9427260429862132, "kl": 0.0, "learning_rate": 9.999999883144609e-07, "loss": -0.0, "reward": 1.6921255588531494, "reward_std": 0.24480973184108734, "rewards/accuracy_reward": 0.5358754396438599, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.078125, "step": 1, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 347.125, "epoch": 0.00013763677654669328, "grad_norm": 5.4242153958980515, "kl": 0.0004100799560546875, "learning_rate": 9.999999532578448e-07, "loss": -0.0, "reward": 1.969768762588501, "reward_std": 0.36097246408462524, "rewards/accuracy_reward": 0.7416438460350037, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.125, "step": 2, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 285.40625, "epoch": 0.00020645516482003991, "grad_norm": 1.918922545329286, "kl": 0.000396728515625, "learning_rate": 9.999998948301528e-07, "loss": 0.0, "reward": 2.023510694503784, "reward_std": 0.31704291701316833, "rewards/accuracy_reward": 0.7344481945037842, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.2421875, "step": 3, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 359.921875, "epoch": 0.00027527355309338655, "grad_norm": 1.7629347280175651, "kl": 0.000514984130859375, "learning_rate": 9.999998130313879e-07, "loss": 0.0, "reward": 1.8883352279663086, "reward_std": 0.4018746614456177, "rewards/accuracy_reward": 0.6789602041244507, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.109375, "step": 4, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 341.359375, "epoch": 0.0003440919413667332, "grad_norm": 50.32920833678885, "kl": 0.00052642822265625, "learning_rate": 9.99999707861554e-07, "loss": 0.0, "reward": 1.9734374284744263, "reward_std": 0.27412092685699463, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0859375, "step": 5, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 245.15625, "epoch": 0.00041291032964007983, "grad_norm": 2.1083418224811674, "kl": 0.000507354736328125, "learning_rate": 9.999995793206556e-07, "loss": -0.0, "reward": 2.028125047683716, "reward_std": 0.3224116563796997, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.234375, "step": 6, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 262.21875, "epoch": 0.00048172871791342647, "grad_norm": 8.241569665306294, "kl": 0.000823974609375, "learning_rate": 9.999994274086992e-07, "loss": -0.0, "reward": 1.8781249523162842, "reward_std": 0.34757232666015625, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.265625, "step": 7, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 249.515625, "epoch": 0.0005505471061867731, "grad_norm": 2.1021647193552333, "kl": 0.0006866455078125, "learning_rate": 9.999992521256916e-07, "loss": 0.0, "reward": 2.113405704498291, "reward_std": 0.21761155128479004, "rewards/accuracy_reward": 0.6571558117866516, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.453125, "step": 8, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 360.578125, "epoch": 0.0006193654944601197, "grad_norm": 1.9858073494288275, "kl": 0.00125885009765625, "learning_rate": 9.99999053471641e-07, "loss": 0.0, "reward": 2.0734376907348633, "reward_std": 0.2930377721786499, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0703125, "step": 9, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 390.234375, "epoch": 0.0006881838827334664, "grad_norm": 37.995645608962185, "kl": 0.0015716552734375, "learning_rate": 9.999988314465569e-07, "loss": 0.0, "reward": 2.1515626907348633, "reward_std": 0.26634812355041504, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1015625, "step": 10, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 289.890625, "epoch": 0.000757002271006813, "grad_norm": 4.010114610509881, "kl": 0.00138092041015625, "learning_rate": 9.999985860504496e-07, "loss": 0.0, "reward": 2.0753254890441895, "reward_std": 0.30335235595703125, "rewards/accuracy_reward": 0.7597004175186157, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.265625, "step": 11, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 241.0, "epoch": 0.0008258206592801597, "grad_norm": 1.9073416352834758, "kl": 0.00201416015625, "learning_rate": 9.999983172833305e-07, "loss": 0.0, "reward": 2.0081605911254883, "reward_std": 0.34683507680892944, "rewards/accuracy_reward": 0.6925355195999146, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3125, "step": 12, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 381.5, "epoch": 0.0008946390475535063, "grad_norm": 2.028530198040944, "kl": 0.00286865234375, "learning_rate": 9.99998025145212e-07, "loss": -0.0, "reward": 1.8599125146865845, "reward_std": 0.36836230754852295, "rewards/accuracy_reward": 0.7302249670028687, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0546875, "step": 13, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 264.125, "epoch": 0.0009634574358268529, "grad_norm": 1.8841747671670988, "kl": 0.0022735595703125, "learning_rate": 9.99997709636108e-07, "loss": 0.0, "reward": 2.076272964477539, "reward_std": 0.2896541357040405, "rewards/accuracy_reward": 0.8434605598449707, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.2109375, "step": 14, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 276.890625, "epoch": 0.0010322758241001996, "grad_norm": 5.173125163131311, "kl": 0.0031585693359375, "learning_rate": 9.999973707560334e-07, "loss": 0.0, "reward": 2.0123777389526367, "reward_std": 0.3146446645259857, "rewards/accuracy_reward": 0.8545652627944946, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1328125, "step": 15, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 258.796875, "epoch": 0.0011010942123735462, "grad_norm": 1.6062734240028482, "kl": 0.0032806396484375, "learning_rate": 9.999970085050035e-07, "loss": 0.0, "reward": 1.857812523841858, "reward_std": 0.32166948914527893, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1953125, "step": 16, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 243.859375, "epoch": 0.0011699126006468928, "grad_norm": 2.0471454124929047, "kl": 0.0034027099609375, "learning_rate": 9.999966228830359e-07, "loss": 0.0, "reward": 2.1578125953674316, "reward_std": 0.3396627604961395, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.2890625, "step": 17, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 303.921875, "epoch": 0.0012387309889202395, "grad_norm": 1.7323252393361859, "kl": 0.00494384765625, "learning_rate": 9.99996213890148e-07, "loss": 0.0, "reward": 1.9950180053710938, "reward_std": 0.2784517705440521, "rewards/accuracy_reward": 0.8247054815292358, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1171875, "step": 18, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 378.53125, "epoch": 0.0013075493771935861, "grad_norm": 1.730970849891431, "kl": 0.005340576171875, "learning_rate": 9.999957815263594e-07, "loss": 0.0, "reward": 1.9453125, "reward_std": 0.28793865442276, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0234375, "step": 19, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 285.0, "epoch": 0.0013763677654669328, "grad_norm": 1.0862409019021797, "kl": 0.00543212890625, "learning_rate": 9.9999532579169e-07, "loss": 0.0, "reward": 2.0459396839141846, "reward_std": 0.1507713943719864, "rewards/accuracy_reward": 0.6350022554397583, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3828125, "step": 20, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 423.125, "epoch": 0.0014451861537402794, "grad_norm": 1.6456170589844838, "kl": 0.005584716796875, "learning_rate": 9.999948466861612e-07, "loss": -0.0, "reward": 2.1156251430511475, "reward_std": 0.38427597284317017, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.078125, "step": 21, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 276.171875, "epoch": 0.001514004542013626, "grad_norm": 2.3479162666886926, "kl": 0.00579833984375, "learning_rate": 9.999943442097954e-07, "loss": 0.0, "reward": 2.2752652168273926, "reward_std": 0.14420877397060394, "rewards/accuracy_reward": 0.7955776453018188, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4453125, "step": 22, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 265.234375, "epoch": 0.0015828229302869727, "grad_norm": 2.7363179293349615, "kl": 0.007598876953125, "learning_rate": 9.999938183626163e-07, "loss": 0.0, "reward": 2.1343750953674316, "reward_std": 0.39889228343963623, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3125, "step": 23, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 243.765625, "epoch": 0.0016516413185603193, "grad_norm": 3.1943980375247714, "kl": 0.00732421875, "learning_rate": 9.99993269144648e-07, "loss": -0.0, "reward": 2.0703125, "reward_std": 0.3398473560810089, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3046875, "step": 24, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 262.015625, "epoch": 0.001720459706833666, "grad_norm": 1.3626767277694383, "kl": 0.00701904296875, "learning_rate": 9.999926965559165e-07, "loss": -0.0, "reward": 2.200932741165161, "reward_std": 0.15440574288368225, "rewards/accuracy_reward": 0.721245288848877, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4609375, "step": 25, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 298.734375, "epoch": 0.0017892780951070126, "grad_norm": 1.711145199551333, "kl": 0.006622314453125, "learning_rate": 9.999921005964485e-07, "loss": -0.0, "reward": 2.029729127883911, "reward_std": 0.39789724349975586, "rewards/accuracy_reward": 0.6437916159629822, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3515625, "step": 26, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 384.21875, "epoch": 0.0018580964833803592, "grad_norm": 1.6841856786400378, "kl": 0.01019287109375, "learning_rate": 9.999914812662719e-07, "loss": 0.0, "reward": 2.0531249046325684, "reward_std": 0.2703586220741272, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.125, "step": 27, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 258.40625, "epoch": 0.0019269148716537059, "grad_norm": 3.007041221271789, "kl": 0.0091552734375, "learning_rate": 9.999908385654156e-07, "loss": -0.0, "reward": 2.078125, "reward_std": 0.34065574407577515, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.234375, "step": 28, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 423.59375, "epoch": 0.0019957332599270525, "grad_norm": 1.9355178287681143, "kl": 0.008544921875, "learning_rate": 9.999901724939097e-07, "loss": 0.0, "reward": 2.234375, "reward_std": 0.24634823203086853, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.09375, "step": 29, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 415.984375, "epoch": 0.002064551648200399, "grad_norm": 1.4152925753735537, "kl": 0.009765625, "learning_rate": 9.999894830517853e-07, "loss": 0.0, "reward": 1.848057746887207, "reward_std": 0.3486965298652649, "rewards/accuracy_reward": 0.6105576753616333, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.109375, "step": 30, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 423.859375, "epoch": 0.0021333700364737458, "grad_norm": 1.7951633577742152, "kl": 0.01092529296875, "learning_rate": 9.999887702390745e-07, "loss": 0.0, "reward": 1.9729365110397339, "reward_std": 0.29504337906837463, "rewards/accuracy_reward": 0.7338740229606628, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1015625, "step": 31, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 445.34375, "epoch": 0.0022021884247470924, "grad_norm": 1.3540356295774907, "kl": 0.01116943359375, "learning_rate": 9.99988034055811e-07, "loss": -0.0, "reward": 1.7292635440826416, "reward_std": 0.3386380672454834, "rewards/accuracy_reward": 0.5542635917663574, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.046875, "step": 32, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 305.8125, "epoch": 0.002271006813020439, "grad_norm": 4.106575376430855, "kl": 0.0108642578125, "learning_rate": 9.999872745020285e-07, "loss": -0.0, "reward": 1.9932403564453125, "reward_std": 0.26477575302124023, "rewards/accuracy_reward": 0.5744904279708862, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 33, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 421.21875, "epoch": 0.0023398252012937857, "grad_norm": 2.8184656548897253, "kl": 0.0125732421875, "learning_rate": 9.999864915777633e-07, "loss": 0.0, "reward": 1.9127222299575806, "reward_std": 0.41250473260879517, "rewards/accuracy_reward": 0.7330347299575806, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0546875, "step": 34, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 290.515625, "epoch": 0.0024086435895671323, "grad_norm": 5.691600708359853, "kl": 0.0162353515625, "learning_rate": 9.999856852830515e-07, "loss": -0.0, "reward": 2.129687547683716, "reward_std": 0.3730224668979645, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1953125, "step": 35, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 294.53125, "epoch": 0.002477461977840479, "grad_norm": 2.2854427840968112, "kl": 0.01373291015625, "learning_rate": 9.999848556179308e-07, "loss": -0.0, "reward": 2.28125, "reward_std": 0.23983222246170044, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.296875, "step": 36, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 301.234375, "epoch": 0.0025462803661138256, "grad_norm": 6.079812547837426, "kl": 0.0169677734375, "learning_rate": 9.999840025824403e-07, "loss": -0.0, "reward": 2.3734374046325684, "reward_std": 0.2759913206100464, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3984375, "step": 37, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 306.984375, "epoch": 0.0026150987543871723, "grad_norm": 2.8060944831813646, "kl": 0.0247802734375, "learning_rate": 9.999831261766197e-07, "loss": -0.0, "reward": 2.28125, "reward_std": 0.3480733036994934, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.390625, "step": 38, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 308.390625, "epoch": 0.002683917142660519, "grad_norm": 3.145049640199668, "kl": 0.0166015625, "learning_rate": 9.999822264005099e-07, "loss": 0.0, "reward": 2.117889881134033, "reward_std": 0.29474613070487976, "rewards/accuracy_reward": 0.6460151672363281, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.421875, "step": 39, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 325.625, "epoch": 0.0027527355309338655, "grad_norm": 2.1557383407428063, "kl": 0.018310546875, "learning_rate": 9.999813032541529e-07, "loss": 0.0, "reward": 1.9237087965011597, "reward_std": 0.2858303487300873, "rewards/accuracy_reward": 0.7440212965011597, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1015625, "step": 40, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 333.921875, "epoch": 0.002821553919207212, "grad_norm": 2.4783835067749163, "kl": 0.0250244140625, "learning_rate": 9.999803567375921e-07, "loss": 0.0, "reward": 1.9630986452102661, "reward_std": 0.26151102781295776, "rewards/accuracy_reward": 0.7927861213684082, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0703125, "step": 41, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 485.546875, "epoch": 0.002890372307480559, "grad_norm": 1.9561287590441832, "kl": 0.017333984375, "learning_rate": 9.999793868508716e-07, "loss": 0.0, "reward": 1.7923319339752197, "reward_std": 0.299124538898468, "rewards/accuracy_reward": 0.629831850528717, "rewards/format_reward": 0.9375, "rewards/transform_reward": 0.078125, "step": 42, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 339.25, "epoch": 0.0029591906957539054, "grad_norm": 2.7235654515471968, "kl": 0.0277099609375, "learning_rate": 9.999783935940367e-07, "loss": 0.0, "reward": 2.135749340057373, "reward_std": 0.21930737793445587, "rewards/accuracy_reward": 0.6044992804527283, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.453125, "step": 43, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 514.78125, "epoch": 0.003028009084027252, "grad_norm": 1.841719817999831, "kl": 0.019287109375, "learning_rate": 9.999773769671338e-07, "loss": 0.0, "reward": 1.9942222833633423, "reward_std": 0.24283026158809662, "rewards/accuracy_reward": 0.7848473787307739, "rewards/format_reward": 0.9375, "rewards/transform_reward": 0.109375, "step": 44, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 481.078125, "epoch": 0.0030968274723005987, "grad_norm": 1.4205451731086696, "kl": 0.022705078125, "learning_rate": 9.999763369702105e-07, "loss": 0.0, "reward": 2.0328125953674316, "reward_std": 0.47232672572135925, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1015625, "step": 45, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 338.71875, "epoch": 0.0031656458605739454, "grad_norm": 2.9679427691606386, "kl": 0.0322265625, "learning_rate": 9.999752736033152e-07, "loss": -0.0, "reward": 2.2671875953674316, "reward_std": 0.3632131516933441, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4453125, "step": 46, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 359.53125, "epoch": 0.003234464248847292, "grad_norm": 4.504968521521287, "kl": 0.02783203125, "learning_rate": 9.99974186866498e-07, "loss": -0.0, "reward": 1.5634241104125977, "reward_std": 0.3219459652900696, "rewards/accuracy_reward": 0.41186168789863586, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0703125, "step": 47, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 308.875, "epoch": 0.0033032826371206386, "grad_norm": 1.5278557665539063, "kl": 0.0279541015625, "learning_rate": 9.999730767598096e-07, "loss": -0.0, "reward": 2.3734374046325684, "reward_std": 0.2076101303100586, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4765625, "step": 48, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 346.0, "epoch": 0.0033721010253939853, "grad_norm": 1.8564418500739688, "kl": 0.029296875, "learning_rate": 9.999719432833017e-07, "loss": -0.0, "reward": 2.2671875953674316, "reward_std": 0.3287111818790436, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3671875, "step": 49, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 493.90625, "epoch": 0.003440919413667332, "grad_norm": 1.1781566093573304, "kl": 0.02880859375, "learning_rate": 9.999707864370274e-07, "loss": 0.0, "reward": 2.09375, "reward_std": 0.33907192945480347, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0625, "step": 50, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 387.84375, "epoch": 0.0035097378019406785, "grad_norm": 2.049067381472899, "kl": 0.04638671875, "learning_rate": 9.999696062210405e-07, "loss": 0.0, "reward": 1.8420155048370361, "reward_std": 0.4262024164199829, "rewards/accuracy_reward": 0.6888905763626099, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.046875, "step": 51, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 375.375, "epoch": 0.003578556190214025, "grad_norm": 1.6001202974164124, "kl": 0.04248046875, "learning_rate": 9.999684026353966e-07, "loss": 0.0, "reward": 2.1491525173187256, "reward_std": 0.19908407330513, "rewards/accuracy_reward": 0.5413399934768677, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 52, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 375.015625, "epoch": 0.003647374578487372, "grad_norm": 7.016090783410042, "kl": 0.053955078125, "learning_rate": 9.999671756801516e-07, "loss": 0.0, "reward": 2.411604404449463, "reward_std": 0.24130891263484955, "rewards/accuracy_reward": 0.7866045832633972, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 53, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 506.46875, "epoch": 0.0037161929667607185, "grad_norm": 1.718761942727118, "kl": 0.03125, "learning_rate": 9.999659253553633e-07, "loss": 0.0, "reward": 2.212552547454834, "reward_std": 0.29444944858551025, "rewards/accuracy_reward": 0.8781775832176208, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.140625, "step": 54, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 524.515625, "epoch": 0.003785011355034065, "grad_norm": 1.6892387517718086, "kl": 0.0306396484375, "learning_rate": 9.999646516610895e-07, "loss": 0.0, "reward": 2.1078124046325684, "reward_std": 0.303860068321228, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1015625, "step": 55, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 537.984375, "epoch": 0.0038538297433074117, "grad_norm": 1.5456270321515537, "kl": 0.03369140625, "learning_rate": 9.999633545973903e-07, "loss": 0.0, "reward": 1.9281251430511475, "reward_std": 0.45354610681533813, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.078125, "step": 56, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 390.65625, "epoch": 0.003922648131580758, "grad_norm": 2.1311213865317944, "kl": 0.060791015625, "learning_rate": 9.99962034164326e-07, "loss": 0.0, "reward": 2.356250047683716, "reward_std": 0.2131771743297577, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.484375, "step": 57, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 396.515625, "epoch": 0.003991466519854105, "grad_norm": 3.3780798676013624, "kl": 0.055908203125, "learning_rate": 9.999606903619584e-07, "loss": 0.0, "reward": 2.11877179145813, "reward_std": 0.320317804813385, "rewards/accuracy_reward": 0.5062717795372009, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 58, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 363.3125, "epoch": 0.004060284908127452, "grad_norm": 1.9386961135788892, "kl": 0.03515625, "learning_rate": 9.999593231903504e-07, "loss": 0.0, "reward": 2.193404197692871, "reward_std": 0.26840758323669434, "rewards/accuracy_reward": 0.6559040546417236, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.421875, "step": 59, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 585.03125, "epoch": 0.004129103296400798, "grad_norm": 11.452306460570176, "kl": 0.035400390625, "learning_rate": 9.999579326495659e-07, "loss": -0.0, "reward": 1.7721657752990723, "reward_std": 0.3972410261631012, "rewards/accuracy_reward": 0.6362283229827881, "rewards/format_reward": 0.90625, "rewards/transform_reward": 0.1015625, "step": 60, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 398.265625, "epoch": 0.004197921684674145, "grad_norm": 1.4403804271125904, "kl": 0.044189453125, "learning_rate": 9.999565187396698e-07, "loss": 0.0, "reward": 1.8436307907104492, "reward_std": 0.35989266633987427, "rewards/accuracy_reward": 0.4217557907104492, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.328125, "step": 61, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 593.125, "epoch": 0.0042667400729474916, "grad_norm": 1.6984765278912006, "kl": 0.03662109375, "learning_rate": 9.999550814607283e-07, "loss": -0.0, "reward": 1.9702131748199463, "reward_std": 0.4202512502670288, "rewards/accuracy_reward": 0.849900484085083, "rewards/format_reward": 0.9375, "rewards/transform_reward": 0.0078125, "step": 62, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 430.40625, "epoch": 0.004335558461220838, "grad_norm": 1.1310355977962945, "kl": 0.0703125, "learning_rate": 9.999536208128086e-07, "loss": -0.0, "reward": 2.0187501907348633, "reward_std": 0.29926151037216187, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.03125, "step": 63, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 575.875, "epoch": 0.004404376849494185, "grad_norm": 1.6454928359592527, "kl": 0.0390625, "learning_rate": 9.999521367959786e-07, "loss": 0.0, "reward": 1.9937500953674316, "reward_std": 0.4808969497680664, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.125, "step": 64, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 416.40625, "epoch": 0.0044731952377675315, "grad_norm": 3.6481999848019058, "kl": 0.06591796875, "learning_rate": 9.999506294103084e-07, "loss": 0.0, "reward": 2.4953126907348633, "reward_std": 0.1857689917087555, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 65, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.296875, "epoch": 0.004542013626040878, "grad_norm": 1.5270628803759294, "kl": 0.048095703125, "learning_rate": 9.999490986558675e-07, "loss": 0.0, "reward": 2.2249999046325684, "reward_std": 0.14389485120773315, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.328125, "step": 66, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 440.96875, "epoch": 0.004610832014314225, "grad_norm": 2.3381324167414577, "kl": 0.07373046875, "learning_rate": 9.999475445327283e-07, "loss": 0.0, "reward": 2.6171875, "reward_std": 0.11368869245052338, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 67, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 401.703125, "epoch": 0.004679650402587571, "grad_norm": 1.6121579473442076, "kl": 0.0625, "learning_rate": 9.99945967040963e-07, "loss": 0.0, "reward": 2.4140625, "reward_std": 0.3468397855758667, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4453125, "step": 68, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 399.875, "epoch": 0.004748468790860918, "grad_norm": 1.3684247646779892, "kl": 0.05517578125, "learning_rate": 9.999443661806454e-07, "loss": 0.0, "reward": 2.1796875, "reward_std": 0.33817365765571594, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0859375, "step": 69, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 397.53125, "epoch": 0.004817287179134265, "grad_norm": 3.765760850287206, "kl": 0.06005859375, "learning_rate": 9.999427419518504e-07, "loss": 0.0, "reward": 2.3578124046325684, "reward_std": 0.28849348425865173, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3203125, "step": 70, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 407.609375, "epoch": 0.004886105567407611, "grad_norm": 2.0279404889037753, "kl": 0.06640625, "learning_rate": 9.999410943546541e-07, "loss": 0.0, "reward": 1.894338846206665, "reward_std": 0.3358474373817444, "rewards/accuracy_reward": 0.6771513819694519, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.1171875, "step": 71, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 660.578125, "epoch": 0.004954923955680958, "grad_norm": 1.3977478283906157, "kl": 0.040771484375, "learning_rate": 9.99939423389133e-07, "loss": -0.0, "reward": 1.576562523841858, "reward_std": 0.6099406480789185, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.8125, "rewards/transform_reward": 0.0703125, "step": 72, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 618.421875, "epoch": 0.005023742343954305, "grad_norm": 2.608826641960917, "kl": 0.046630859375, "learning_rate": 9.999377290553656e-07, "loss": 0.0, "reward": 1.9796876907348633, "reward_std": 0.6622117161750793, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.90625, "rewards/transform_reward": 0.1171875, "step": 73, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 647.796875, "epoch": 0.005092560732227651, "grad_norm": 2.1822028395353286, "kl": 0.04541015625, "learning_rate": 9.999360113534313e-07, "loss": 0.0, "reward": 1.5467082262039185, "reward_std": 0.46144580841064453, "rewards/accuracy_reward": 0.5388957262039185, "rewards/format_reward": 0.796875, "rewards/transform_reward": 0.0859375, "step": 74, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 494.84375, "epoch": 0.005161379120500998, "grad_norm": 1.4323092426150743, "kl": 0.06787109375, "learning_rate": 9.999342702834098e-07, "loss": 0.0, "reward": 2.515625, "reward_std": 0.29131996631622314, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.484375, "step": 75, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 633.890625, "epoch": 0.0052301975087743445, "grad_norm": 2.119126568337269, "kl": 0.0458984375, "learning_rate": 9.999325058453827e-07, "loss": -0.0, "reward": 1.759378433227539, "reward_std": 0.5327105522155762, "rewards/accuracy_reward": 0.6500034332275391, "rewards/format_reward": 0.875, "rewards/transform_reward": 0.078125, "step": 76, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 635.140625, "epoch": 0.005299015897047691, "grad_norm": 3.869943922009549, "kl": 0.04443359375, "learning_rate": 9.999307180394326e-07, "loss": 0.0, "reward": 1.8843750953674316, "reward_std": 0.6721916198730469, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.875, "rewards/transform_reward": 0.109375, "step": 77, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 430.5, "epoch": 0.005367834285321038, "grad_norm": 3.3080707343123112, "kl": 0.060791015625, "learning_rate": 9.99928906865643e-07, "loss": 0.0, "reward": 2.2288966178894043, "reward_std": 0.3335486054420471, "rewards/accuracy_reward": 0.6913965940475464, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.390625, "step": 78, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 420.984375, "epoch": 0.005436652673594384, "grad_norm": 1.483182801031034, "kl": 0.0634765625, "learning_rate": 9.999270723240988e-07, "loss": -0.0, "reward": 2.223437547683716, "reward_std": 0.477693647146225, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4453125, "step": 79, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 433.265625, "epoch": 0.005505471061867731, "grad_norm": 1.69224109582996, "kl": 0.06884765625, "learning_rate": 9.999252144148854e-07, "loss": 0.0, "reward": 2.5859375, "reward_std": 0.19718679785728455, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4296875, "step": 80, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 624.125, "epoch": 0.005574289450141078, "grad_norm": 2.480812181586453, "kl": 0.04931640625, "learning_rate": 9.999233331380897e-07, "loss": 0.0, "reward": 1.7671875953674316, "reward_std": 0.46417587995529175, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.84375, "rewards/transform_reward": 0.0234375, "step": 81, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 450.09375, "epoch": 0.005643107838414424, "grad_norm": 6.31178683044425, "kl": 0.07080078125, "learning_rate": 9.999214284937998e-07, "loss": 0.0, "reward": 2.0571305751800537, "reward_std": 0.41603025794029236, "rewards/accuracy_reward": 0.6290056705474854, "rewards/format_reward": 0.9375, "rewards/transform_reward": 0.375, "step": 82, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 417.078125, "epoch": 0.005711926226687771, "grad_norm": 1.7521305547187769, "kl": 0.08203125, "learning_rate": 9.999195004821044e-07, "loss": 0.0, "reward": 2.373626708984375, "reward_std": 0.315652072429657, "rewards/accuracy_reward": 0.8142515420913696, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.40625, "step": 83, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 442.90625, "epoch": 0.005780744614961118, "grad_norm": 1.0946806329884826, "kl": 0.0615234375, "learning_rate": 9.99917549103094e-07, "loss": -0.0, "reward": 2.171875, "reward_std": 0.19676993787288666, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.328125, "step": 84, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 450.265625, "epoch": 0.005849563003234464, "grad_norm": 3.6337366018259334, "kl": 0.062255859375, "learning_rate": 9.999155743568598e-07, "loss": 0.0, "reward": 2.262761116027832, "reward_std": 0.2631891965866089, "rewards/accuracy_reward": 0.7877610921859741, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.34375, "step": 85, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 463.265625, "epoch": 0.005918381391507811, "grad_norm": 1.2997373283015572, "kl": 0.09228515625, "learning_rate": 9.999135762434939e-07, "loss": 0.0, "reward": 1.984375, "reward_std": 0.2007448673248291, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 86, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 456.1875, "epoch": 0.0059871997797811575, "grad_norm": 2.3276266010285784, "kl": 0.07421875, "learning_rate": 9.999115547630896e-07, "loss": 0.0, "reward": 2.0441112518310547, "reward_std": 0.3298320174217224, "rewards/accuracy_reward": 0.5769237279891968, "rewards/format_reward": 0.921875, "rewards/transform_reward": 0.4296875, "step": 87, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 607.375, "epoch": 0.006056018168054504, "grad_norm": 1.160070822523263, "kl": 0.052734375, "learning_rate": 9.999095099157417e-07, "loss": 0.0, "reward": 1.84375, "reward_std": 0.41645777225494385, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.875, "rewards/transform_reward": 0.125, "step": 88, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 490.375, "epoch": 0.006124836556327851, "grad_norm": 1.1850628512675503, "kl": 0.0859375, "learning_rate": 9.999074417015457e-07, "loss": 0.0, "reward": 2.03125, "reward_std": 0.26370882987976074, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.015625, "step": 89, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 428.71875, "epoch": 0.0061936549446011974, "grad_norm": 2.3339856569543005, "kl": 0.08251953125, "learning_rate": 9.999053501205979e-07, "loss": 0.0, "reward": 2.4749999046325684, "reward_std": 0.2618870735168457, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 90, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 439.34375, "epoch": 0.006262473332874544, "grad_norm": 1.2868545618946532, "kl": 0.07568359375, "learning_rate": 9.999032351729967e-07, "loss": 0.0, "reward": 1.7296875715255737, "reward_std": 0.3227123022079468, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0546875, "step": 91, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 499.5625, "epoch": 0.006331291721147891, "grad_norm": 2.390235318209299, "kl": 0.0830078125, "learning_rate": 9.999010968588405e-07, "loss": 0.0, "reward": 2.1709914207458496, "reward_std": 0.3031667470932007, "rewards/accuracy_reward": 0.6647413969039917, "rewards/format_reward": 0.90625, "rewards/transform_reward": 0.46875, "step": 92, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 537.3125, "epoch": 0.006400110109421237, "grad_norm": 2.14740082652408, "kl": 0.072265625, "learning_rate": 9.998989351782293e-07, "loss": 0.0, "reward": 1.568123459815979, "reward_std": 0.3851642310619354, "rewards/accuracy_reward": 0.552498459815979, "rewards/format_reward": 0.890625, "rewards/transform_reward": 0.0, "step": 93, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 456.875, "epoch": 0.006468928497694584, "grad_norm": 1.7402743601537103, "kl": 0.078125, "learning_rate": 9.998967501312643e-07, "loss": 0.0, "reward": 2.248718023300171, "reward_std": 0.3175176978111267, "rewards/accuracy_reward": 0.6518429517745972, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.453125, "step": 94, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 465.140625, "epoch": 0.006537746885967931, "grad_norm": 1.318311483047791, "kl": 0.07373046875, "learning_rate": 9.998945417180473e-07, "loss": 0.0, "reward": 1.8390625715255737, "reward_std": 0.41585925221443176, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0546875, "step": 95, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 595.890625, "epoch": 0.006606565274241277, "grad_norm": 1.401197717548961, "kl": 0.060302734375, "learning_rate": 9.99892309938682e-07, "loss": 0.0, "reward": 1.9516829252243042, "reward_std": 0.30760276317596436, "rewards/accuracy_reward": 0.6969954371452332, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.1171875, "step": 96, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 484.59375, "epoch": 0.006675383662514624, "grad_norm": 3.567841783721616, "kl": 0.072265625, "learning_rate": 9.998900547932727e-07, "loss": 0.0, "reward": 2.37754225730896, "reward_std": 0.2091120183467865, "rewards/accuracy_reward": 0.808792233467102, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.40625, "step": 97, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 445.671875, "epoch": 0.0067442020507879705, "grad_norm": 1.2643340282620508, "kl": 0.08349609375, "learning_rate": 9.998877762819242e-07, "loss": 0.0, "reward": 1.9359376430511475, "reward_std": 0.2508181929588318, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0078125, "step": 98, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 434.21875, "epoch": 0.006813020439061317, "grad_norm": 1.3877686425936597, "kl": 0.08447265625, "learning_rate": 9.998854744047437e-07, "loss": 0.0, "reward": 2.0250000953674316, "reward_std": 0.2546413540840149, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.015625, "step": 99, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 454.65625, "epoch": 0.006881838827334664, "grad_norm": 1.2946924336448202, "kl": 0.0888671875, "learning_rate": 9.998831491618383e-07, "loss": 0.0, "reward": 2.1738944053649902, "reward_std": 0.21055853366851807, "rewards/accuracy_reward": 0.7082691192626953, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.328125, "step": 100, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 557.4375, "epoch": 0.0069506572156080105, "grad_norm": 1.4375164444341044, "kl": 0.09130859375, "learning_rate": 9.99880800553317e-07, "loss": -0.0, "reward": 1.9738883972167969, "reward_std": 0.4276154935359955, "rewards/accuracy_reward": 0.49576348066329956, "rewards/format_reward": 0.890625, "rewards/transform_reward": 0.484375, "step": 101, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 453.671875, "epoch": 0.007019475603881357, "grad_norm": 15.946897735918197, "kl": 0.0947265625, "learning_rate": 9.998784285792895e-07, "loss": 0.0, "reward": 2.067415714263916, "reward_std": 0.17350955307483673, "rewards/accuracy_reward": 0.856478214263916, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0234375, "step": 102, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 636.25, "epoch": 0.007088293992154704, "grad_norm": 1.597200146515094, "kl": 0.057861328125, "learning_rate": 9.998760332398665e-07, "loss": 0.0, "reward": 1.8484375476837158, "reward_std": 0.5890048146247864, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.890625, "rewards/transform_reward": 0.0390625, "step": 103, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 657.65625, "epoch": 0.00715711238042805, "grad_norm": 2.5337330686281585, "kl": 0.060546875, "learning_rate": 9.998736145351602e-07, "loss": 0.0, "reward": 1.7859375476837158, "reward_std": 0.5969496965408325, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.84375, "rewards/transform_reward": 0.1015625, "step": 104, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 478.375, "epoch": 0.007225930768701397, "grad_norm": 3.7182296354743234, "kl": 0.0947265625, "learning_rate": 9.998711724652834e-07, "loss": 0.0, "reward": 2.432037830352783, "reward_std": 0.23088276386260986, "rewards/accuracy_reward": 0.8273503184318542, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4453125, "step": 105, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 531.8125, "epoch": 0.007294749156974744, "grad_norm": 1.3326936501183015, "kl": 0.09619140625, "learning_rate": 9.998687070303507e-07, "loss": -0.0, "reward": 2.3765625953674316, "reward_std": 0.33701348304748535, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4765625, "step": 106, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 474.4375, "epoch": 0.00736356754524809, "grad_norm": 2.292876305474341, "kl": 0.099609375, "learning_rate": 9.998662182304768e-07, "loss": 0.0, "reward": 1.8234375715255737, "reward_std": 0.305568665266037, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0078125, "step": 107, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 507.046875, "epoch": 0.007432385933521437, "grad_norm": 1.135166008069742, "kl": 0.09033203125, "learning_rate": 9.998637060657785e-07, "loss": 0.0, "reward": 2.316160202026367, "reward_std": 0.27352410554885864, "rewards/accuracy_reward": 0.8708477020263672, "rewards/format_reward": 0.890625, "rewards/transform_reward": 0.3828125, "step": 108, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 511.484375, "epoch": 0.0075012043217947836, "grad_norm": 1.298816975594669, "kl": 0.099609375, "learning_rate": 9.99861170536373e-07, "loss": 0.0, "reward": 2.1467666625976562, "reward_std": 0.22224542498588562, "rewards/accuracy_reward": 0.5545793175697327, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 109, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 514.765625, "epoch": 0.00757002271006813, "grad_norm": 0.6703014083783713, "kl": 0.10302734375, "learning_rate": 9.998586116423784e-07, "loss": -0.0, "reward": 2.4750001430511475, "reward_std": 0.08017838001251221, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 110, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 553.09375, "epoch": 0.007638841098341477, "grad_norm": 1.1432510260303166, "kl": 0.09228515625, "learning_rate": 9.998560293839151e-07, "loss": 0.0, "reward": 2.310309886932373, "reward_std": 0.12794756889343262, "rewards/accuracy_reward": 0.6665599346160889, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 111, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 638.515625, "epoch": 0.0077076594866148235, "grad_norm": 1.5601915006458769, "kl": 0.0654296875, "learning_rate": 9.998534237611034e-07, "loss": 0.0, "reward": 1.6890625953674316, "reward_std": 0.6728575825691223, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.84375, "rewards/transform_reward": 0.1171875, "step": 112, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 457.71875, "epoch": 0.00777647787488817, "grad_norm": 1.4259226975862107, "kl": 0.10888671875, "learning_rate": 9.99850794774065e-07, "loss": 0.0, "reward": 2.3578124046325684, "reward_std": 0.39022037386894226, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4609375, "step": 113, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 523.46875, "epoch": 0.007845296263161517, "grad_norm": 2.1428139054985595, "kl": 0.09912109375, "learning_rate": 9.99848142422923e-07, "loss": 0.0, "reward": 2.3856325149536133, "reward_std": 0.2571346163749695, "rewards/accuracy_reward": 0.8090700507164001, "rewards/format_reward": 0.9375, "rewards/transform_reward": 0.4609375, "step": 114, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 634.65625, "epoch": 0.007914114651434864, "grad_norm": 1.3264731394938303, "kl": 0.061767578125, "learning_rate": 9.998454667078012e-07, "loss": 0.0, "reward": 2.085937738418579, "reward_std": 0.6520329713821411, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.890625, "rewards/transform_reward": 0.1640625, "step": 115, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 544.5625, "epoch": 0.00798293303970821, "grad_norm": 1.3705025428049267, "kl": 0.10205078125, "learning_rate": 9.998427676288247e-07, "loss": 0.0, "reward": 2.488492965698242, "reward_std": 0.2217375487089157, "rewards/accuracy_reward": 0.8541179895401001, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.5, "step": 116, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 646.171875, "epoch": 0.008051751427981558, "grad_norm": 1.1300094714214983, "kl": 0.059814453125, "learning_rate": 9.998400451861197e-07, "loss": 0.0, "reward": 1.5984375476837158, "reward_std": 0.5963996648788452, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.859375, "rewards/transform_reward": 0.0859375, "step": 117, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 593.546875, "epoch": 0.008120569816254903, "grad_norm": 0.7296562856607638, "kl": 0.0712890625, "learning_rate": 9.998372993798136e-07, "loss": 0.0, "reward": 1.9609375, "reward_std": 0.18212249875068665, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0234375, "step": 118, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 647.453125, "epoch": 0.00818938820452825, "grad_norm": 1.0964461828750554, "kl": 0.06298828125, "learning_rate": 9.998345302100346e-07, "loss": 0.0, "reward": 2.0078125, "reward_std": 0.655358076095581, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.875, "rewards/transform_reward": 0.1796875, "step": 119, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 484.734375, "epoch": 0.008258206592801597, "grad_norm": 4.783412917747765, "kl": 0.09912109375, "learning_rate": 9.99831737676912e-07, "loss": 0.0, "reward": 2.3387160301208496, "reward_std": 0.26470181345939636, "rewards/accuracy_reward": 0.724653422832489, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4609375, "step": 120, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 597.46875, "epoch": 0.008327024981074944, "grad_norm": 1.1253600806888158, "kl": 0.0634765625, "learning_rate": 9.998289217805766e-07, "loss": -0.0, "reward": 1.5224275588989258, "reward_std": 0.47979485988616943, "rewards/accuracy_reward": 0.5333649516105652, "rewards/format_reward": 0.796875, "rewards/transform_reward": 0.0546875, "step": 121, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 464.890625, "epoch": 0.00839584336934829, "grad_norm": 1.2666414840380982, "kl": 0.1083984375, "learning_rate": 9.998260825211598e-07, "loss": -0.0, "reward": 2.293750047683716, "reward_std": 0.340684711933136, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.453125, "step": 122, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 579.34375, "epoch": 0.008464661757621637, "grad_norm": 1.289060860620253, "kl": 0.06640625, "learning_rate": 9.998232198987945e-07, "loss": 0.0, "reward": 2.0625, "reward_std": 0.3690018653869629, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.125, "step": 123, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 445.40625, "epoch": 0.008533480145894983, "grad_norm": 1.412553114015288, "kl": 0.1181640625, "learning_rate": 9.998203339136145e-07, "loss": 0.0, "reward": 1.9625000953674316, "reward_std": 0.15208697319030762, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.03125, "step": 124, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 633.5625, "epoch": 0.00860229853416833, "grad_norm": 1.9782472669303215, "kl": 0.06787109375, "learning_rate": 9.998174245657543e-07, "loss": 0.0, "reward": 1.6812833547592163, "reward_std": 0.5773366689682007, "rewards/accuracy_reward": 0.6484708189964294, "rewards/format_reward": 0.84375, "rewards/transform_reward": 0.0546875, "step": 125, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 483.828125, "epoch": 0.008671116922441676, "grad_norm": 2.4229138988662933, "kl": 0.10546875, "learning_rate": 9.998144918553505e-07, "loss": 0.0, "reward": 2.371875047683716, "reward_std": 0.3658888339996338, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.359375, "step": 126, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 478.84375, "epoch": 0.008739935310715024, "grad_norm": 2.2597681514979335, "kl": 0.10986328125, "learning_rate": 9.998115357825396e-07, "loss": 0.0, "reward": 2.3843750953674316, "reward_std": 0.3380052447319031, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.390625, "step": 127, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 503.6875, "epoch": 0.00880875369898837, "grad_norm": 1.775085241840849, "kl": 0.10302734375, "learning_rate": 9.998085563474604e-07, "loss": 0.0, "reward": 2.2333333492279053, "reward_std": 0.32846397161483765, "rewards/accuracy_reward": 0.6114583015441895, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 128, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 585.078125, "epoch": 0.008877572087261717, "grad_norm": 1.2851018637135547, "kl": 0.0712890625, "learning_rate": 9.998055535502515e-07, "loss": 0.0, "reward": 1.843597650527954, "reward_std": 0.31024205684661865, "rewards/accuracy_reward": 0.6404727101325989, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.046875, "step": 129, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 527.75, "epoch": 0.008946390475535063, "grad_norm": 1.2459743461325554, "kl": 0.1005859375, "learning_rate": 9.998025273910538e-07, "loss": 0.0, "reward": 1.9453126192092896, "reward_std": 0.33883893489837646, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0078125, "step": 130, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 523.140625, "epoch": 0.00901520886380841, "grad_norm": 1.322686871357064, "kl": 0.09716796875, "learning_rate": 9.997994778700083e-07, "loss": 0.0, "reward": 2.122774124145508, "reward_std": 0.3148372769355774, "rewards/accuracy_reward": 0.6133990287780762, "rewards/format_reward": 0.921875, "rewards/transform_reward": 0.421875, "step": 131, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 502.484375, "epoch": 0.009084027252081756, "grad_norm": 3.174476923438486, "kl": 0.1064453125, "learning_rate": 9.997964049872578e-07, "loss": 0.0, "reward": 1.8384902477264404, "reward_std": 0.11403562128543854, "rewards/accuracy_reward": 0.6791151762008667, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 132, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 604.734375, "epoch": 0.009152845640355104, "grad_norm": 1.9500664476067027, "kl": 0.07080078125, "learning_rate": 9.99793308742946e-07, "loss": 0.0, "reward": 1.806249976158142, "reward_std": 0.42861005663871765, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.921875, "rewards/transform_reward": 0.140625, "step": 133, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 586.625, "epoch": 0.00922166402862845, "grad_norm": 2.030732912900692, "kl": 0.07373046875, "learning_rate": 9.997901891372177e-07, "loss": 0.0, "reward": 1.9965695142745972, "reward_std": 0.5110511779785156, "rewards/accuracy_reward": 0.7606319189071655, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.1015625, "step": 134, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 487.390625, "epoch": 0.009290482416901797, "grad_norm": 0.8001450279513651, "kl": 0.1005859375, "learning_rate": 9.997870461702182e-07, "loss": -0.0, "reward": 2.5609376430511475, "reward_std": 0.15441180765628815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 135, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 507.953125, "epoch": 0.009359300805175143, "grad_norm": 1.2511218443168597, "kl": 0.1025390625, "learning_rate": 9.997838798420948e-07, "loss": 0.0, "reward": 1.59375, "reward_std": 0.37687623500823975, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 136, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 446.40625, "epoch": 0.00942811919344849, "grad_norm": 0.7162960011523825, "kl": 0.111328125, "learning_rate": 9.997806901529955e-07, "loss": 0.0, "reward": 2.5234375, "reward_std": 0.1295740008354187, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4765625, "step": 137, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 508.09375, "epoch": 0.009496937581721836, "grad_norm": 1.368042190080439, "kl": 0.091796875, "learning_rate": 9.997774771030694e-07, "loss": -0.0, "reward": 2.258765697479248, "reward_std": 0.5583165884017944, "rewards/accuracy_reward": 0.6853281259536743, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.4765625, "step": 138, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 611.671875, "epoch": 0.009565755969995184, "grad_norm": 1.4917696684026394, "kl": 0.0703125, "learning_rate": 9.997742406924667e-07, "loss": 0.0, "reward": 1.6603522300720215, "reward_std": 0.45166802406311035, "rewards/accuracy_reward": 0.5572271347045898, "rewards/format_reward": 0.90625, "rewards/transform_reward": 0.0625, "step": 139, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 582.296875, "epoch": 0.00963457435826853, "grad_norm": 1.4770008493090714, "kl": 0.06884765625, "learning_rate": 9.997709809213384e-07, "loss": 0.0, "reward": 2.050933837890625, "reward_std": 0.3929651975631714, "rewards/accuracy_reward": 0.7665587663650513, "rewards/format_reward": 0.921875, "rewards/transform_reward": 0.203125, "step": 140, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 592.59375, "epoch": 0.009703392746541877, "grad_norm": 2.9635554417490555, "kl": 0.06640625, "learning_rate": 9.997676977898372e-07, "loss": 0.0, "reward": 1.738048791885376, "reward_std": 0.36405616998672485, "rewards/accuracy_reward": 0.644298791885376, "rewards/format_reward": 0.859375, "rewards/transform_reward": 0.09375, "step": 141, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 476.78125, "epoch": 0.009772211134815223, "grad_norm": 1.456142122300616, "kl": 0.1103515625, "learning_rate": 9.997643912981165e-07, "loss": 0.0, "reward": 2.4390625953674316, "reward_std": 0.28437888622283936, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3515625, "step": 142, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 538.0625, "epoch": 0.00984102952308857, "grad_norm": 1.5846388149314978, "kl": 0.09765625, "learning_rate": 9.997610614463307e-07, "loss": 0.0, "reward": 2.409374952316284, "reward_std": 0.3857174217700958, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.484375, "step": 143, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 493.046875, "epoch": 0.009909847911361916, "grad_norm": 1.9042270919318482, "kl": 0.1083984375, "learning_rate": 9.997577082346354e-07, "loss": 0.0, "reward": 2.2141826152801514, "reward_std": 0.447122186422348, "rewards/accuracy_reward": 0.6813701391220093, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.4140625, "step": 144, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 501.78125, "epoch": 0.009978666299635263, "grad_norm": 1.1793686262687275, "kl": 0.09765625, "learning_rate": 9.997543316631876e-07, "loss": 0.0, "reward": 2.542187452316284, "reward_std": 0.1975996196269989, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 145, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 459.078125, "epoch": 0.01004748468790861, "grad_norm": 1.4154030499139085, "kl": 0.12451171875, "learning_rate": 9.997509317321451e-07, "loss": 0.0, "reward": 2.418750047683716, "reward_std": 0.33315739035606384, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.40625, "step": 146, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 459.59375, "epoch": 0.010116303076181957, "grad_norm": 0.8105153730794362, "kl": 0.1083984375, "learning_rate": 9.997475084416666e-07, "loss": 0.0, "reward": 2.245312452316284, "reward_std": 0.15481634438037872, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4765625, "step": 147, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 631.25, "epoch": 0.010185121464455302, "grad_norm": 1.925621708062065, "kl": 0.06005859375, "learning_rate": 9.997440617919123e-07, "loss": -0.0, "reward": 1.7437586784362793, "reward_std": 0.30714040994644165, "rewards/accuracy_reward": 0.6625086069107056, "rewards/format_reward": 0.859375, "rewards/transform_reward": 0.0625, "step": 148, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 472.703125, "epoch": 0.01025393985272865, "grad_norm": 2.0764276379680315, "kl": 0.10693359375, "learning_rate": 9.997405917830431e-07, "loss": 0.0, "reward": 2.4546873569488525, "reward_std": 0.23072049021720886, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4453125, "step": 149, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 589.984375, "epoch": 0.010322758241001996, "grad_norm": 1.5390579741262154, "kl": 0.072265625, "learning_rate": 9.997370984152213e-07, "loss": 0.0, "reward": 1.9941564798355103, "reward_std": 0.29300519824028015, "rewards/accuracy_reward": 0.8707189559936523, "rewards/format_reward": 0.890625, "rewards/transform_reward": 0.0546875, "step": 150, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 527.90625, "epoch": 0.010391576629275343, "grad_norm": 1.0029640241974047, "kl": 0.09326171875, "learning_rate": 9.997335816886103e-07, "loss": -0.0, "reward": 2.232590436935425, "reward_std": 0.2714734971523285, "rewards/accuracy_reward": 0.7107153534889221, "rewards/format_reward": 0.9375, "rewards/transform_reward": 0.421875, "step": 151, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 483.765625, "epoch": 0.010460395017548689, "grad_norm": 1.1556217191876477, "kl": 0.10107421875, "learning_rate": 9.997300416033746e-07, "loss": 0.0, "reward": 2.015625, "reward_std": 0.25758394598960876, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 152, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 562.28125, "epoch": 0.010529213405822037, "grad_norm": 1.9251822242159358, "kl": 0.076171875, "learning_rate": 9.99726478159679e-07, "loss": 0.0, "reward": 2.2718749046325684, "reward_std": 0.3141992688179016, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.125, "step": 153, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 594.15625, "epoch": 0.010598031794095382, "grad_norm": 1.8129075191506272, "kl": 0.06787109375, "learning_rate": 9.997228913576909e-07, "loss": 0.0, "reward": 1.8718750476837158, "reward_std": 0.48350459337234497, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0625, "step": 154, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 567.984375, "epoch": 0.01066685018236873, "grad_norm": 1.3018655804862163, "kl": 0.07080078125, "learning_rate": 9.997192811975775e-07, "loss": 0.0, "reward": 2.0296874046325684, "reward_std": 0.44238588213920593, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0859375, "step": 155, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 471.375, "epoch": 0.010735668570642076, "grad_norm": 1.8006816637996954, "kl": 0.10791015625, "learning_rate": 9.997156476795077e-07, "loss": -0.0, "reward": 2.3868093490600586, "reward_std": 0.3005393147468567, "rewards/accuracy_reward": 0.7555592656135559, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.46875, "step": 156, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 453.0, "epoch": 0.010804486958915423, "grad_norm": 1.3508240958139068, "kl": 0.11279296875, "learning_rate": 9.997119908036511e-07, "loss": 0.0, "reward": 2.495312452316284, "reward_std": 0.2574208378791809, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3515625, "step": 157, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 538.703125, "epoch": 0.010873305347188769, "grad_norm": 1.3474814513383317, "kl": 0.0830078125, "learning_rate": 9.99708310570179e-07, "loss": 0.0, "reward": 1.8953793048858643, "reward_std": 0.26271742582321167, "rewards/accuracy_reward": 0.7391293048858643, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 158, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 589.515625, "epoch": 0.010942123735462116, "grad_norm": 1.73535561761779, "kl": 0.0712890625, "learning_rate": 9.99704606979263e-07, "loss": -0.0, "reward": 1.7874741554260254, "reward_std": 0.4535794258117676, "rewards/accuracy_reward": 0.6734116077423096, "rewards/format_reward": 0.890625, "rewards/transform_reward": 0.0703125, "step": 159, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 494.515625, "epoch": 0.011010942123735462, "grad_norm": 1.0667553537562675, "kl": 0.10546875, "learning_rate": 9.997008800310765e-07, "loss": -0.0, "reward": 2.467466354370117, "reward_std": 0.24516668915748596, "rewards/accuracy_reward": 0.8377788066864014, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4453125, "step": 160, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 476.375, "epoch": 0.01107976051200881, "grad_norm": 1.0935315838404167, "kl": 0.1044921875, "learning_rate": 9.996971297257937e-07, "loss": -0.0, "reward": 2.309375047683716, "reward_std": 0.2440408170223236, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 161, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 455.140625, "epoch": 0.011148578900282155, "grad_norm": 1.079173815962213, "kl": 0.111328125, "learning_rate": 9.996933560635897e-07, "loss": -0.0, "reward": 2.4390625953674316, "reward_std": 0.2923761010169983, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4453125, "step": 162, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 467.359375, "epoch": 0.011217397288555503, "grad_norm": 1.6745647757696664, "kl": 0.12255859375, "learning_rate": 9.996895590446412e-07, "loss": 0.0, "reward": 2.3375000953674316, "reward_std": 0.35792866349220276, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.34375, "step": 163, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 564.109375, "epoch": 0.011286215676828849, "grad_norm": 0.9667821761851395, "kl": 0.07861328125, "learning_rate": 9.996857386691254e-07, "loss": 0.0, "reward": 1.767187476158142, "reward_std": 0.22923904657363892, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0234375, "step": 164, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 575.265625, "epoch": 0.011355034065102196, "grad_norm": 2.296251711753088, "kl": 0.08056640625, "learning_rate": 9.996818949372209e-07, "loss": -0.0, "reward": 1.9127171039581299, "reward_std": 0.25786900520324707, "rewards/accuracy_reward": 0.7127171754837036, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.046875, "step": 165, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 573.0625, "epoch": 0.011423852453375542, "grad_norm": 2.4277277800861774, "kl": 0.08642578125, "learning_rate": 9.996780278491077e-07, "loss": 0.0, "reward": 1.8031359910964966, "reward_std": 0.19792093336582184, "rewards/accuracy_reward": 0.6812610626220703, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 166, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 466.890625, "epoch": 0.01149267084164889, "grad_norm": 0.9171232226385567, "kl": 0.13671875, "learning_rate": 9.99674137404966e-07, "loss": 0.0, "reward": 2.3062500953674316, "reward_std": 0.09127141535282135, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.40625, "step": 167, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 523.078125, "epoch": 0.011561489229922235, "grad_norm": 0.7496107084387834, "kl": 0.1025390625, "learning_rate": 9.996702236049783e-07, "loss": -0.0, "reward": 2.4919252395629883, "reward_std": 0.1965782344341278, "rewards/accuracy_reward": 0.8372378349304199, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.4921875, "step": 168, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 586.3125, "epoch": 0.011630307618195583, "grad_norm": 1.600785568921222, "kl": 0.08837890625, "learning_rate": 9.996662864493269e-07, "loss": 0.0, "reward": 1.8468750715255737, "reward_std": 0.27220115065574646, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0625, "step": 169, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 574.640625, "epoch": 0.011699126006468928, "grad_norm": 1.4061324124696792, "kl": 0.08935546875, "learning_rate": 9.996623259381963e-07, "loss": 0.0, "reward": 2.0671873092651367, "reward_std": 0.34852132201194763, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1484375, "step": 170, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 426.921875, "epoch": 0.011767944394742276, "grad_norm": 1.223034545454587, "kl": 0.1337890625, "learning_rate": 9.996583420717712e-07, "loss": 0.0, "reward": 2.5171875953674316, "reward_std": 0.07144056260585785, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4765625, "step": 171, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 475.15625, "epoch": 0.011836762783015622, "grad_norm": 1.0865838964183385, "kl": 0.12451171875, "learning_rate": 9.996543348502382e-07, "loss": -0.0, "reward": 1.9507089853286743, "reward_std": 0.17472712695598602, "rewards/accuracy_reward": 0.7819589376449585, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 172, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 528.3125, "epoch": 0.01190558117128897, "grad_norm": 2.8832969316547494, "kl": 0.09912109375, "learning_rate": 9.996503042737844e-07, "loss": 0.0, "reward": 2.3775253295898438, "reward_std": 0.4073331952095032, "rewards/accuracy_reward": 0.7462754249572754, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.5, "step": 173, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 476.734375, "epoch": 0.011974399559562315, "grad_norm": 2.9379403891388085, "kl": 0.11767578125, "learning_rate": 9.996462503425983e-07, "loss": 0.0, "reward": 2.3062500953674316, "reward_std": 0.34717077016830444, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 174, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 483.984375, "epoch": 0.012043217947835663, "grad_norm": 1.2418461976056225, "kl": 0.14453125, "learning_rate": 9.996421730568695e-07, "loss": 0.0, "reward": 2.212165117263794, "reward_std": 0.25767362117767334, "rewards/accuracy_reward": 0.5965402126312256, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 175, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 484.375, "epoch": 0.012112036336109008, "grad_norm": 15.630829258595751, "kl": 0.1435546875, "learning_rate": 9.996380724167882e-07, "loss": 0.0, "reward": 2.3519134521484375, "reward_std": 0.3772566318511963, "rewards/accuracy_reward": 0.7284760475158691, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4765625, "step": 176, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 464.375, "epoch": 0.012180854724382356, "grad_norm": 2.639421770951722, "kl": 0.1318359375, "learning_rate": 9.996339484225467e-07, "loss": 0.0, "reward": 2.450000047683716, "reward_std": 0.28976207971572876, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 177, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 477.4375, "epoch": 0.012249673112655702, "grad_norm": 1.253658934812521, "kl": 0.1474609375, "learning_rate": 9.99629801074337e-07, "loss": 0.0, "reward": 2.1977548599243164, "reward_std": 0.2701722979545593, "rewards/accuracy_reward": 0.7305673956871033, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3046875, "step": 178, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 514.640625, "epoch": 0.012318491500929049, "grad_norm": 1.0892656489160537, "kl": 0.10693359375, "learning_rate": 9.996256303723533e-07, "loss": 0.0, "reward": 1.9249999523162842, "reward_std": 0.24976977705955505, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 179, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 500.234375, "epoch": 0.012387309889202395, "grad_norm": 2.4354022612386768, "kl": 0.115234375, "learning_rate": 9.996214363167907e-07, "loss": 0.0, "reward": 1.9486531019210815, "reward_std": 0.2033180147409439, "rewards/accuracy_reward": 0.7861530184745789, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 180, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 606.09375, "epoch": 0.012456128277475742, "grad_norm": 1.5744297084858185, "kl": 0.08154296875, "learning_rate": 9.996172189078452e-07, "loss": 0.0, "reward": 1.8104228973388672, "reward_std": 0.5212111473083496, "rewards/accuracy_reward": 0.6197978854179382, "rewards/format_reward": 0.921875, "rewards/transform_reward": 0.125, "step": 181, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 468.53125, "epoch": 0.012524946665749088, "grad_norm": 8.806517960234068, "kl": 0.1494140625, "learning_rate": 9.996129781457136e-07, "loss": -0.0, "reward": 2.2641761302948, "reward_std": 0.31594473123550415, "rewards/accuracy_reward": 0.7219885587692261, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3828125, "step": 182, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 516.1875, "epoch": 0.012593765054022436, "grad_norm": 1.1267706782025793, "kl": 0.1435546875, "learning_rate": 9.996087140305944e-07, "loss": -0.0, "reward": 2.1993026733398438, "reward_std": 0.3260611891746521, "rewards/accuracy_reward": 0.7446150779724121, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.3046875, "step": 183, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 480.203125, "epoch": 0.012662583442295781, "grad_norm": 1.5455870333036756, "kl": 0.1298828125, "learning_rate": 9.99604426562687e-07, "loss": -0.0, "reward": 2.4593749046325684, "reward_std": 0.20125514268875122, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.390625, "step": 184, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 594.453125, "epoch": 0.012731401830569129, "grad_norm": 1.48509697359289, "kl": 0.08203125, "learning_rate": 9.996001157421917e-07, "loss": 0.0, "reward": 2.0531249046325684, "reward_std": 0.4182473123073578, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.078125, "step": 185, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 595.625, "epoch": 0.012800220218842475, "grad_norm": 1.979014885070394, "kl": 0.08349609375, "learning_rate": 9.995957815693099e-07, "loss": -0.0, "reward": 1.9324504137039185, "reward_std": 0.3245902955532074, "rewards/accuracy_reward": 0.7605754137039185, "rewards/format_reward": 0.9375, "rewards/transform_reward": 0.0625, "step": 186, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 579.953125, "epoch": 0.012869038607115822, "grad_norm": 0.5951611478487445, "kl": 0.08349609375, "learning_rate": 9.995914240442443e-07, "loss": -0.0, "reward": 1.8738701343536377, "reward_std": 0.07069871574640274, "rewards/accuracy_reward": 0.7238701581954956, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 187, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 612.59375, "epoch": 0.012937856995389168, "grad_norm": 2.569609088739357, "kl": 0.0791015625, "learning_rate": 9.995870431671986e-07, "loss": 0.0, "reward": 2.052428960800171, "reward_std": 0.2660844922065735, "rewards/accuracy_reward": 0.8711789846420288, "rewards/format_reward": 0.921875, "rewards/transform_reward": 0.078125, "step": 188, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 526.453125, "epoch": 0.013006675383662515, "grad_norm": 0.7308755237201566, "kl": 0.1396484375, "learning_rate": 9.995826389383775e-07, "loss": 0.0, "reward": 1.6692240238189697, "reward_std": 0.2190026342868805, "rewards/accuracy_reward": 0.5832864046096802, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.0078125, "step": 189, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 487.703125, "epoch": 0.013075493771935861, "grad_norm": 2.234715454353881, "kl": 0.1533203125, "learning_rate": 9.995782113579868e-07, "loss": -0.0, "reward": 2.3359375, "reward_std": 0.286685973405838, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4765625, "step": 190, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 570.46875, "epoch": 0.013144312160209209, "grad_norm": 1.157001205799331, "kl": 0.1015625, "learning_rate": 9.995737604262336e-07, "loss": -0.0, "reward": 1.7215254306793213, "reward_std": 0.5464913845062256, "rewards/accuracy_reward": 0.33402547240257263, "rewards/format_reward": 0.828125, "rewards/transform_reward": 0.484375, "step": 191, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 591.0625, "epoch": 0.013213130548482555, "grad_norm": 1.9568089811698293, "kl": 0.08984375, "learning_rate": 9.995692861433257e-07, "loss": 0.0, "reward": 2.098437547683716, "reward_std": 0.4096197187900543, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.1015625, "step": 192, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 520.15625, "epoch": 0.013281948936755902, "grad_norm": 0.0, "kl": 0.126953125, "learning_rate": 9.995647885094726e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 193, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 497.03125, "epoch": 0.013350767325029248, "grad_norm": 5.8908397171549485, "kl": 0.1513671875, "learning_rate": 9.995602675248844e-07, "loss": 0.0, "reward": 2.327733039855957, "reward_std": 0.2056005597114563, "rewards/accuracy_reward": 0.7230456471443176, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4453125, "step": 194, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 634.0625, "epoch": 0.013419585713302595, "grad_norm": 1.433199144185773, "kl": 0.0751953125, "learning_rate": 9.995557231897724e-07, "loss": -0.0, "reward": 1.657145619392395, "reward_std": 0.4699442684650421, "rewards/accuracy_reward": 0.5618330836296082, "rewards/format_reward": 0.875, "rewards/transform_reward": 0.0859375, "step": 195, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 550.578125, "epoch": 0.013488404101575941, "grad_norm": 1.3049108546759496, "kl": 0.11376953125, "learning_rate": 9.995511555043487e-07, "loss": 0.0, "reward": 2.0863184928894043, "reward_std": 0.26585903763771057, "rewards/accuracy_reward": 0.5144436359405518, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.5, "step": 196, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 624.296875, "epoch": 0.013557222489849289, "grad_norm": 4.443108574258184, "kl": 0.0908203125, "learning_rate": 9.995465644688275e-07, "loss": 0.0, "reward": 2.0875000953674316, "reward_std": 0.42837637662887573, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.109375, "step": 197, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 507.546875, "epoch": 0.013626040878122634, "grad_norm": 1.6910928537190244, "kl": 0.1494140625, "learning_rate": 9.995419500834226e-07, "loss": -0.0, "reward": 2.3703126907348633, "reward_std": 0.3231234550476074, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4140625, "step": 198, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 503.875, "epoch": 0.013694859266395982, "grad_norm": 0.8823905802178876, "kl": 0.11083984375, "learning_rate": 9.995373123483503e-07, "loss": 0.0, "reward": 1.9343749284744263, "reward_std": 0.16666369140148163, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 199, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 523.328125, "epoch": 0.013763677654669328, "grad_norm": 1.1755381726094882, "kl": 0.119140625, "learning_rate": 9.995326512638271e-07, "loss": -0.0, "reward": 2.0500001907348633, "reward_std": 0.22470125555992126, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 200, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 505.328125, "epoch": 0.013832496042942675, "grad_norm": 1.0699044719757282, "kl": 0.16015625, "learning_rate": 9.995279668300708e-07, "loss": -0.0, "reward": 2.078125, "reward_std": 0.412932425737381, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.40625, "step": 201, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 510.90625, "epoch": 0.013901314431216021, "grad_norm": 1.431375296479211, "kl": 0.11962890625, "learning_rate": 9.995232590473006e-07, "loss": -0.0, "reward": 2.5546162128448486, "reward_std": 0.1494200974702835, "rewards/accuracy_reward": 0.8671161532402039, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 202, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 483.765625, "epoch": 0.013970132819489368, "grad_norm": 0.9008010305001912, "kl": 0.169921875, "learning_rate": 9.995185279157365e-07, "loss": 0.0, "reward": 2.598437547683716, "reward_std": 0.20579548180103302, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 203, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 554.265625, "epoch": 0.014038951207762714, "grad_norm": 0.8187348286268713, "kl": 0.09814453125, "learning_rate": 9.995137734355994e-07, "loss": 0.0, "reward": 2.33121395111084, "reward_std": 0.19871650636196136, "rewards/accuracy_reward": 0.6968388557434082, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.5, "step": 204, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 604.109375, "epoch": 0.014107769596036062, "grad_norm": 1.2826323051248447, "kl": 0.08935546875, "learning_rate": 9.995089956071118e-07, "loss": -0.0, "reward": 1.756250023841858, "reward_std": 0.44765400886535645, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.078125, "step": 205, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 512.96875, "epoch": 0.014176587984309407, "grad_norm": 2.424076389607338, "kl": 0.1181640625, "learning_rate": 9.995041944304968e-07, "loss": -0.0, "reward": 2.378124952316284, "reward_std": 0.27963730692863464, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 206, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 511.640625, "epoch": 0.014245406372582755, "grad_norm": 1.6973717831759585, "kl": 0.1533203125, "learning_rate": 9.99499369905979e-07, "loss": 0.0, "reward": 2.0516133308410645, "reward_std": 0.13595569133758545, "rewards/accuracy_reward": 0.876613438129425, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 207, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 604.71875, "epoch": 0.0143142247608561, "grad_norm": 1.0964471809657423, "kl": 0.09033203125, "learning_rate": 9.99494522033784e-07, "loss": 0.0, "reward": 1.6595810651779175, "reward_std": 0.40608978271484375, "rewards/accuracy_reward": 0.5923935174942017, "rewards/format_reward": 0.890625, "rewards/transform_reward": 0.0546875, "step": 208, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 463.15625, "epoch": 0.014383043149129448, "grad_norm": 1.0245889132950443, "kl": 0.166015625, "learning_rate": 9.99489650814138e-07, "loss": 0.0, "reward": 1.9781248569488525, "reward_std": 0.24429605901241302, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 209, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 613.953125, "epoch": 0.014451861537402794, "grad_norm": 3.2259040570938624, "kl": 0.09130859375, "learning_rate": 9.994847562472693e-07, "loss": -0.0, "reward": 1.90625, "reward_std": 0.488470196723938, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.171875, "step": 210, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 490.0, "epoch": 0.014520679925676142, "grad_norm": 0.9594884994567405, "kl": 0.1630859375, "learning_rate": 9.994798383334062e-07, "loss": 0.0, "reward": 2.370312452316284, "reward_std": 0.17235727608203888, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 211, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 521.09375, "epoch": 0.014589498313949487, "grad_norm": 1.062508849775351, "kl": 0.146484375, "learning_rate": 9.994748970727786e-07, "loss": 0.0, "reward": 2.142301321029663, "reward_std": 0.24214798212051392, "rewards/accuracy_reward": 0.5938637256622314, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.4296875, "step": 212, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 405.234375, "epoch": 0.014658316702222835, "grad_norm": 1.7489329803058753, "kl": 0.1572265625, "learning_rate": 9.994699324656175e-07, "loss": 0.0, "reward": 2.457812547683716, "reward_std": 0.06197666376829147, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4296875, "step": 213, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 508.015625, "epoch": 0.01472713509049618, "grad_norm": 1.1472552635813287, "kl": 0.1474609375, "learning_rate": 9.994649445121553e-07, "loss": 0.0, "reward": 2.160064935684204, "reward_std": 0.3209501802921295, "rewards/accuracy_reward": 0.6413149237632751, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 214, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 465.921875, "epoch": 0.014795953478769528, "grad_norm": 1.9432927412687695, "kl": 0.130859375, "learning_rate": 9.994599332126245e-07, "loss": 0.0, "reward": 1.818750023841858, "reward_std": 0.4266121983528137, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 215, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 456.484375, "epoch": 0.014864771867042874, "grad_norm": 1.1195819592916871, "kl": 0.1728515625, "learning_rate": 9.9945489856726e-07, "loss": 0.0, "reward": 1.8919333219528198, "reward_std": 0.19628220796585083, "rewards/accuracy_reward": 0.707558274269104, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.03125, "step": 216, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 467.265625, "epoch": 0.014933590255316221, "grad_norm": 3.3383257061546128, "kl": 0.12255859375, "learning_rate": 9.994498405762967e-07, "loss": 0.0, "reward": 1.8856481313705444, "reward_std": 0.06304100155830383, "rewards/accuracy_reward": 0.7418981194496155, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 217, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 590.84375, "epoch": 0.015002408643589567, "grad_norm": 1.3703407674654724, "kl": 0.09765625, "learning_rate": 9.994447592399714e-07, "loss": 0.0, "reward": 1.7603089809417725, "reward_std": 0.34137070178985596, "rewards/accuracy_reward": 0.6368714570999146, "rewards/format_reward": 0.90625, "rewards/transform_reward": 0.0859375, "step": 218, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 576.6875, "epoch": 0.015071227031862915, "grad_norm": 1.8854235522360678, "kl": 0.09814453125, "learning_rate": 9.994396545585212e-07, "loss": 0.0, "reward": 1.8341633081436157, "reward_std": 0.3414686322212219, "rewards/accuracy_reward": 0.6029133200645447, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.09375, "step": 219, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 507.390625, "epoch": 0.01514004542013626, "grad_norm": 0.7245248155382039, "kl": 0.1318359375, "learning_rate": 9.994345265321849e-07, "loss": -0.0, "reward": 2.4025609493255615, "reward_std": 0.144331693649292, "rewards/accuracy_reward": 0.7431858777999878, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 220, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 556.609375, "epoch": 0.015208863808409608, "grad_norm": 1.783275747844706, "kl": 0.1103515625, "learning_rate": 9.994293751612023e-07, "loss": 0.0, "reward": 1.7890625, "reward_std": 0.25943005084991455, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0078125, "step": 221, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 521.234375, "epoch": 0.015277682196682954, "grad_norm": 1.6364694436067697, "kl": 0.1279296875, "learning_rate": 9.994242004458138e-07, "loss": 0.0, "reward": 2.2965407371520996, "reward_std": 0.27397316694259644, "rewards/accuracy_reward": 0.6496659517288208, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 222, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 596.421875, "epoch": 0.015346500584956301, "grad_norm": 1.4151867039856303, "kl": 0.09765625, "learning_rate": 9.994190023862617e-07, "loss": 0.0, "reward": 1.8039871454238892, "reward_std": 0.4010041356086731, "rewards/accuracy_reward": 0.7008620500564575, "rewards/format_reward": 0.921875, "rewards/transform_reward": 0.015625, "step": 223, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 595.109375, "epoch": 0.015415318973229647, "grad_norm": 1.5240701704052986, "kl": 0.09912109375, "learning_rate": 9.994137809827887e-07, "loss": -0.0, "reward": 1.644148588180542, "reward_std": 0.37946438789367676, "rewards/accuracy_reward": 0.5566484928131104, "rewards/format_reward": 0.921875, "rewards/transform_reward": 0.0, "step": 224, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 499.859375, "epoch": 0.015484137361502994, "grad_norm": 0.9476529840821608, "kl": 0.13671875, "learning_rate": 9.99408536235639e-07, "loss": -0.0, "reward": 2.5356369018554688, "reward_std": 0.015261968597769737, "rewards/accuracy_reward": 0.835637092590332, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 225, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 481.359375, "epoch": 0.01555295574977634, "grad_norm": 1.6567263176193576, "kl": 0.1357421875, "learning_rate": 9.99403268145058e-07, "loss": 0.0, "reward": 1.8434163331985474, "reward_std": 0.2594913840293884, "rewards/accuracy_reward": 0.6934162974357605, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 226, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 555.90625, "epoch": 0.015621774138049688, "grad_norm": 1.4845700095468262, "kl": 0.1142578125, "learning_rate": 9.993979767112912e-07, "loss": 0.0, "reward": 2.086851119995117, "reward_std": 0.27590593695640564, "rewards/accuracy_reward": 0.774351179599762, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.140625, "step": 227, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 558.421875, "epoch": 0.015690592526323034, "grad_norm": 4.5894972057571986, "kl": 0.11474609375, "learning_rate": 9.993926619345868e-07, "loss": 0.0, "reward": 2.2874999046325684, "reward_std": 0.2463010847568512, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.125, "step": 228, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 502.203125, "epoch": 0.01575941091459638, "grad_norm": 0.9928583193501888, "kl": 0.1357421875, "learning_rate": 9.993873238151925e-07, "loss": -0.0, "reward": 1.78382408618927, "reward_std": 0.2155984491109848, "rewards/accuracy_reward": 0.6306990385055542, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 229, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 569.828125, "epoch": 0.01582822930286973, "grad_norm": 1.750219987515313, "kl": 0.11328125, "learning_rate": 9.993819623533583e-07, "loss": 0.0, "reward": 1.8615379333496094, "reward_std": 0.48095351457595825, "rewards/accuracy_reward": 0.6427879333496094, "rewards/format_reward": 0.9375, "rewards/transform_reward": 0.125, "step": 230, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 506.921875, "epoch": 0.015897047691143074, "grad_norm": 1.1429048824180308, "kl": 0.140625, "learning_rate": 9.993765775493347e-07, "loss": 0.0, "reward": 2.012500047683716, "reward_std": 0.24493902921676636, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 231, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 534.890625, "epoch": 0.01596586607941642, "grad_norm": 1.3402736588104325, "kl": 0.125, "learning_rate": 9.993711694033732e-07, "loss": 0.0, "reward": 1.9055769443511963, "reward_std": 0.41032344102859497, "rewards/accuracy_reward": 0.6243269443511963, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.140625, "step": 232, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 501.8125, "epoch": 0.016034684467689766, "grad_norm": 1.3736082694105014, "kl": 0.1376953125, "learning_rate": 9.993657379157267e-07, "loss": 0.0, "reward": 1.8376654386520386, "reward_std": 0.1210813894867897, "rewards/accuracy_reward": 0.6814154386520386, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 233, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 507.125, "epoch": 0.016103502855963115, "grad_norm": 1.3364164869377662, "kl": 0.1279296875, "learning_rate": 9.993602830866493e-07, "loss": -0.0, "reward": 2.267426013946533, "reward_std": 0.21262799203395844, "rewards/accuracy_reward": 0.639301061630249, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 234, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 564.0, "epoch": 0.01617232124423646, "grad_norm": 1.5966984359238228, "kl": 0.1142578125, "learning_rate": 9.993548049163957e-07, "loss": 0.0, "reward": 2.090989828109741, "reward_std": 0.28679484128952026, "rewards/accuracy_reward": 0.8019272685050964, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.1171875, "step": 235, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 472.0, "epoch": 0.016241139632509807, "grad_norm": 0.9771565960926728, "kl": 0.1416015625, "learning_rate": 9.99349303405222e-07, "loss": -0.0, "reward": 2.3546600341796875, "reward_std": 0.1650097519159317, "rewards/accuracy_reward": 0.6890348792076111, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 236, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 572.75, "epoch": 0.016309958020783152, "grad_norm": 3.1167569508074675, "kl": 0.1181640625, "learning_rate": 9.993437785533854e-07, "loss": 0.0, "reward": 2.0875000953674316, "reward_std": 0.2367314100265503, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 237, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 458.671875, "epoch": 0.0163787764090565, "grad_norm": 0.6577340723442229, "kl": 0.1826171875, "learning_rate": 9.99338230361144e-07, "loss": -0.0, "reward": 2.6171875, "reward_std": 0.10227546095848083, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 238, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 498.34375, "epoch": 0.016447594797329847, "grad_norm": 2.1178531281264754, "kl": 0.1484375, "learning_rate": 9.993326588287574e-07, "loss": 0.0, "reward": 1.9026072025299072, "reward_std": 0.30171746015548706, "rewards/accuracy_reward": 0.7651071548461914, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 239, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 555.9375, "epoch": 0.016516413185603193, "grad_norm": 2.998274597375034, "kl": 0.1328125, "learning_rate": 9.993270639564858e-07, "loss": 0.0, "reward": 1.8203706741333008, "reward_std": 0.23973025381565094, "rewards/accuracy_reward": 0.7094330787658691, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.0078125, "step": 240, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 505.234375, "epoch": 0.01658523157387654, "grad_norm": 1.4887474493248076, "kl": 0.134765625, "learning_rate": 9.99321445744591e-07, "loss": 0.0, "reward": 2.135488510131836, "reward_std": 0.2654191553592682, "rewards/accuracy_reward": 0.5526760816574097, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.4921875, "step": 241, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 597.734375, "epoch": 0.016654049962149888, "grad_norm": 2.011601692121965, "kl": 0.1220703125, "learning_rate": 9.993158041933353e-07, "loss": -0.0, "reward": 2.0053024291992188, "reward_std": 0.29236793518066406, "rewards/accuracy_reward": 0.7740524411201477, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.078125, "step": 242, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 472.328125, "epoch": 0.016722868350423234, "grad_norm": 0.8984323624201693, "kl": 0.181640625, "learning_rate": 9.993101393029826e-07, "loss": -0.0, "reward": 2.112499952316284, "reward_std": 0.14699368178844452, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 243, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 496.46875, "epoch": 0.01679168673869658, "grad_norm": 2.4964324673515526, "kl": 0.140625, "learning_rate": 9.993044510737974e-07, "loss": 0.0, "reward": 2.400068521499634, "reward_std": 0.2404705286026001, "rewards/accuracy_reward": 0.7281934022903442, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 244, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 539.421875, "epoch": 0.016860505126969925, "grad_norm": 1.2786893830304553, "kl": 0.142578125, "learning_rate": 9.99298739506046e-07, "loss": 0.0, "reward": 1.8549938201904297, "reward_std": 0.32745492458343506, "rewards/accuracy_reward": 0.5253064036369324, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1796875, "step": 245, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 512.859375, "epoch": 0.016929323515243275, "grad_norm": 1.3728963591390688, "kl": 0.1455078125, "learning_rate": 9.99293004599995e-07, "loss": -0.0, "reward": 2.2780745029449463, "reward_std": 0.35676509141921997, "rewards/accuracy_reward": 0.6546369194984436, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 246, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 522.890625, "epoch": 0.01699814190351662, "grad_norm": 1.4395813180136796, "kl": 0.13671875, "learning_rate": 9.992872463559127e-07, "loss": 0.0, "reward": 2.4121241569519043, "reward_std": 0.25787419080734253, "rewards/accuracy_reward": 0.7464991807937622, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 247, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 490.640625, "epoch": 0.017066960291789966, "grad_norm": 0.0, "kl": 0.150390625, "learning_rate": 9.992814647740683e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 248, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 472.28125, "epoch": 0.017135778680063312, "grad_norm": 1.7361891678125116, "kl": 0.171875, "learning_rate": 9.992756598547316e-07, "loss": 0.0, "reward": 2.4937500953674316, "reward_std": 0.28282630443573, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 249, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 499.875, "epoch": 0.01720459706833666, "grad_norm": 1.403735780472445, "kl": 0.1552734375, "learning_rate": 9.992698315981743e-07, "loss": -0.0, "reward": 2.0640625953674316, "reward_std": 0.24822963774204254, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0546875, "step": 250, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 504.546875, "epoch": 0.017273415456610007, "grad_norm": 0.967383766219616, "kl": 0.173828125, "learning_rate": 9.99263980004669e-07, "loss": -0.0, "reward": 2.4303340911865234, "reward_std": 0.24330654740333557, "rewards/accuracy_reward": 0.7990840673446655, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.453125, "step": 251, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 493.40625, "epoch": 0.017342233844883353, "grad_norm": 1.2716413209829684, "kl": 0.15234375, "learning_rate": 9.992581050744888e-07, "loss": -0.0, "reward": 2.5038485527038574, "reward_std": 0.1349901258945465, "rewards/accuracy_reward": 0.822598397731781, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 252, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 492.484375, "epoch": 0.0174110522331567, "grad_norm": 1.0726588170721068, "kl": 0.15234375, "learning_rate": 9.992522068079085e-07, "loss": 0.0, "reward": 2.4107959270477295, "reward_std": 0.24507643282413483, "rewards/accuracy_reward": 0.7545458674430847, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 253, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 531.703125, "epoch": 0.017479870621430048, "grad_norm": 2.362718752485347, "kl": 0.1494140625, "learning_rate": 9.992462852052039e-07, "loss": -0.0, "reward": 2.1828126907348633, "reward_std": 0.20699402689933777, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1328125, "step": 254, "temperature": 1.0 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 516.140625, "epoch": 0.017548689009703394, "grad_norm": 1.1033901726267579, "kl": 0.1640625, "learning_rate": 9.992403402666516e-07, "loss": -0.0, "reward": 1.4333409070968628, "reward_std": 0.3997887969017029, "rewards/accuracy_reward": 0.34427836537361145, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0078125, "step": 255, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 474.109375, "epoch": 0.01761750739797674, "grad_norm": 1.8779350685922014, "kl": 0.1669921875, "learning_rate": 9.992343719925297e-07, "loss": 0.0, "reward": 2.2757275104522705, "reward_std": 0.1973811388015747, "rewards/accuracy_reward": 0.7444774508476257, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.40625, "step": 256, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 502.671875, "epoch": 0.017686325786250085, "grad_norm": 0.832207422839695, "kl": 0.1572265625, "learning_rate": 9.99228380383117e-07, "loss": 0.0, "reward": 1.8068026304244995, "reward_std": 0.14302782714366913, "rewards/accuracy_reward": 0.6661776304244995, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 257, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 550.296875, "epoch": 0.017755144174523434, "grad_norm": 1.1216066699630947, "kl": 0.130859375, "learning_rate": 9.992223654386932e-07, "loss": -0.0, "reward": 1.8475613594055176, "reward_std": 0.2007874846458435, "rewards/accuracy_reward": 0.7006863951683044, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 258, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.375, "completion_length": 585.25, "epoch": 0.01782396256279678, "grad_norm": 1.1124696353955013, "kl": 0.126953125, "learning_rate": 9.992163271595405e-07, "loss": 0.0, "reward": 1.3266775608062744, "reward_std": 0.3200254440307617, "rewards/accuracy_reward": 0.33292752504348755, "rewards/format_reward": 0.90625, "rewards/transform_reward": 0.0, "step": 259, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 483.203125, "epoch": 0.017892780951070126, "grad_norm": 1.4705751566246779, "kl": 0.166015625, "learning_rate": 9.9921026554594e-07, "loss": 0.0, "reward": 2.512500047683716, "reward_std": 0.20010189712047577, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 260, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 471.171875, "epoch": 0.01796159933934347, "grad_norm": 2.750834053348287, "kl": 0.181640625, "learning_rate": 9.992041805981757e-07, "loss": 0.0, "reward": 2.475607395172119, "reward_std": 0.23669742047786713, "rewards/accuracy_reward": 0.8037325739860535, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 261, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 508.09375, "epoch": 0.01803041772761682, "grad_norm": 0.8574508429445842, "kl": 0.18359375, "learning_rate": 9.991980723165319e-07, "loss": -0.0, "reward": 1.771668553352356, "reward_std": 0.1790010631084442, "rewards/accuracy_reward": 0.623231053352356, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0078125, "step": 262, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 494.265625, "epoch": 0.018099236115890167, "grad_norm": 0.9247475029505395, "kl": 0.1650390625, "learning_rate": 9.991919407012938e-07, "loss": -0.0, "reward": 2.4375, "reward_std": 0.2297932654619217, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 263, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 510.796875, "epoch": 0.018168054504163512, "grad_norm": 2.4209371757918894, "kl": 0.162109375, "learning_rate": 9.991857857527485e-07, "loss": 0.0, "reward": 2.340625047683716, "reward_std": 0.40209275484085083, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 264, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 514.0625, "epoch": 0.018236872892436858, "grad_norm": 1.0355280832612803, "kl": 0.1513671875, "learning_rate": 9.991796074711834e-07, "loss": 0.0, "reward": 2.4601221084594727, "reward_std": 0.2592206597328186, "rewards/accuracy_reward": 0.7851221561431885, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 265, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 477.1875, "epoch": 0.018305691280710207, "grad_norm": 1.38037633627958, "kl": 0.1884765625, "learning_rate": 9.991734058568872e-07, "loss": 0.0, "reward": 2.0687499046325684, "reward_std": 0.2082977592945099, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 266, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 513.109375, "epoch": 0.018374509668983553, "grad_norm": 2.7219360695499994, "kl": 0.1513671875, "learning_rate": 9.9916718091015e-07, "loss": 0.0, "reward": 2.330864667892456, "reward_std": 0.1473025232553482, "rewards/accuracy_reward": 0.705864667892456, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.46875, "step": 267, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 504.796875, "epoch": 0.0184433280572569, "grad_norm": 1.2759110214141862, "kl": 0.1611328125, "learning_rate": 9.991609326312628e-07, "loss": 0.0, "reward": 2.512500047683716, "reward_std": 0.20830953121185303, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 268, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 482.3125, "epoch": 0.018512146445530245, "grad_norm": 0.8839940670807945, "kl": 0.158203125, "learning_rate": 9.991546610205175e-07, "loss": 0.0, "reward": 2.1109375953674316, "reward_std": 0.17603811621665955, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3828125, "step": 269, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 492.0625, "epoch": 0.018580964833803594, "grad_norm": 2.714298631692555, "kl": 0.189453125, "learning_rate": 9.991483660782074e-07, "loss": 0.0, "reward": 2.5584793090820312, "reward_std": 0.1328749656677246, "rewards/accuracy_reward": 0.8803544044494629, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.484375, "step": 270, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 468.84375, "epoch": 0.01864978322207694, "grad_norm": 1.1616260356272634, "kl": 0.1728515625, "learning_rate": 9.991420478046265e-07, "loss": 0.0, "reward": 2.1836111545562744, "reward_std": 0.06488646566867828, "rewards/accuracy_reward": 0.6617360711097717, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 271, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 554.546875, "epoch": 0.018718601610350286, "grad_norm": 1.177168176381073, "kl": 0.142578125, "learning_rate": 9.9913570620007e-07, "loss": 0.0, "reward": 1.9765686988830566, "reward_std": 0.23834960162639618, "rewards/accuracy_reward": 0.7968810200691223, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0078125, "step": 272, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.0, "completion_length": 478.328125, "epoch": 0.01878741999862363, "grad_norm": 1.435658043485624, "kl": 0.193359375, "learning_rate": 9.99129341264835e-07, "loss": 0.0, "reward": 1.7823121547698975, "reward_std": 0.45723116397857666, "rewards/accuracy_reward": 0.6260620951652527, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.015625, "step": 273, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 548.28125, "epoch": 0.01885623838689698, "grad_norm": 2.4302087900014295, "kl": 0.142578125, "learning_rate": 9.991229529992184e-07, "loss": 0.0, "reward": 2.153125047683716, "reward_std": 0.26010584831237793, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.140625, "step": 274, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 476.21875, "epoch": 0.018925056775170326, "grad_norm": 0.9057739368202568, "kl": 0.19140625, "learning_rate": 9.991165414035192e-07, "loss": -0.0, "reward": 1.8062500953674316, "reward_std": 0.22724726796150208, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 275, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 546.625, "epoch": 0.018993875163443672, "grad_norm": 4.054783131191326, "kl": 0.15234375, "learning_rate": 9.991101064780367e-07, "loss": 0.0, "reward": 1.8564486503601074, "reward_std": 0.3713592290878296, "rewards/accuracy_reward": 0.6251987814903259, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.09375, "step": 276, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 452.5, "epoch": 0.019062693551717018, "grad_norm": 2.488553092573374, "kl": 0.181640625, "learning_rate": 9.991036482230718e-07, "loss": 0.0, "reward": 2.3414573669433594, "reward_std": 0.24026963114738464, "rewards/accuracy_reward": 0.7258324027061462, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.46875, "step": 277, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 451.203125, "epoch": 0.019131511939990367, "grad_norm": 1.2315469869275908, "kl": 0.2001953125, "learning_rate": 9.990971666389265e-07, "loss": 0.0, "reward": 2.2015624046325684, "reward_std": 0.24220015108585358, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3203125, "step": 278, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 465.328125, "epoch": 0.019200330328263713, "grad_norm": 0.8859978018904046, "kl": 0.18359375, "learning_rate": 9.990906617259038e-07, "loss": -0.0, "reward": 2.578125, "reward_std": 0.13462227582931519, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.453125, "step": 279, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 482.765625, "epoch": 0.01926914871653706, "grad_norm": 1.8297509819526272, "kl": 0.1708984375, "learning_rate": 9.990841334843076e-07, "loss": 0.0, "reward": 1.9156250953674316, "reward_std": 0.28092557191848755, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 280, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 484.921875, "epoch": 0.019337967104810404, "grad_norm": 1.072828406986399, "kl": 0.1796875, "learning_rate": 9.99077581914443e-07, "loss": 0.0, "reward": 2.388404130935669, "reward_std": 0.1283886432647705, "rewards/accuracy_reward": 0.7290289998054504, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 281, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 517.8125, "epoch": 0.019406785493083754, "grad_norm": 0.7345227360236376, "kl": 0.1591796875, "learning_rate": 9.990710070166165e-07, "loss": 0.0, "reward": 1.945312738418579, "reward_std": 0.20736484229564667, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0078125, "step": 282, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 518.9375, "epoch": 0.0194756038813571, "grad_norm": 0.9618664696600137, "kl": 0.1611328125, "learning_rate": 9.990644087911353e-07, "loss": 0.0, "reward": 1.881250023841858, "reward_std": 0.22724725306034088, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 283, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 537.859375, "epoch": 0.019544422269630445, "grad_norm": 1.727300245141655, "kl": 0.1591796875, "learning_rate": 9.990577872383076e-07, "loss": -0.0, "reward": 2.1578125953674316, "reward_std": 0.3640297055244446, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1640625, "step": 284, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 518.1875, "epoch": 0.01961324065790379, "grad_norm": 2.2013846635307583, "kl": 0.150390625, "learning_rate": 9.99051142358443e-07, "loss": -0.0, "reward": 2.176736831665039, "reward_std": 0.28602689504623413, "rewards/accuracy_reward": 0.7939242720603943, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.2265625, "step": 285, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 472.21875, "epoch": 0.01968205904617714, "grad_norm": 0.7772861254062274, "kl": 0.18359375, "learning_rate": 9.990444741518525e-07, "loss": 0.0, "reward": 2.0969724655151367, "reward_std": 0.15955610573291779, "rewards/accuracy_reward": 0.5922849178314209, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3671875, "step": 286, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 533.875, "epoch": 0.019750877434450486, "grad_norm": 1.510665194018513, "kl": 0.1552734375, "learning_rate": 9.990377826188473e-07, "loss": -0.0, "reward": 2.0484375953674316, "reward_std": 0.2466607242822647, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1328125, "step": 287, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 468.84375, "epoch": 0.019819695822723832, "grad_norm": 0.43044009375342657, "kl": 0.181640625, "learning_rate": 9.990310677597403e-07, "loss": 0.0, "reward": 2.3812499046325684, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 288, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 515.296875, "epoch": 0.019888514210997178, "grad_norm": 4.481433499973385, "kl": 0.15625, "learning_rate": 9.990243295748454e-07, "loss": 0.0, "reward": 2.2695984840393066, "reward_std": 0.25770071148872375, "rewards/accuracy_reward": 0.9008485674858093, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.171875, "step": 289, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 463.765625, "epoch": 0.019957332599270527, "grad_norm": 1.0406451973088575, "kl": 0.1806640625, "learning_rate": 9.990175680644777e-07, "loss": 0.0, "reward": 2.0114574432373047, "reward_std": 0.20874668657779694, "rewards/accuracy_reward": 0.8364574909210205, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 290, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 461.46875, "epoch": 0.020026150987543873, "grad_norm": 1.0062185134237174, "kl": 0.2080078125, "learning_rate": 9.990107832289531e-07, "loss": 0.0, "reward": 1.9562499523162842, "reward_std": 0.1332113891839981, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 291, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 485.984375, "epoch": 0.02009496937581722, "grad_norm": 0.8329446019980097, "kl": 0.1787109375, "learning_rate": 9.990039750685885e-07, "loss": 0.0, "reward": 2.0687499046325684, "reward_std": 0.12246951460838318, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 292, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 467.578125, "epoch": 0.020163787764090564, "grad_norm": 0.6396212292030826, "kl": 0.1845703125, "learning_rate": 9.989971435837027e-07, "loss": 0.0, "reward": 2.028125047683716, "reward_std": 0.11363068222999573, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 293, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 501.015625, "epoch": 0.020232606152363913, "grad_norm": 1.2377648510063193, "kl": 0.1748046875, "learning_rate": 9.989902887746146e-07, "loss": -0.0, "reward": 2.4708547592163086, "reward_std": 0.21375179290771484, "rewards/accuracy_reward": 0.8083546757698059, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 294, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 522.640625, "epoch": 0.02030142454063726, "grad_norm": 4.796454103061044, "kl": 0.15625, "learning_rate": 9.989834106416447e-07, "loss": 0.0, "reward": 2.1703126430511475, "reward_std": 0.4965924620628357, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1953125, "step": 295, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 553.609375, "epoch": 0.020370242928910605, "grad_norm": 1.4546870777424348, "kl": 0.158203125, "learning_rate": 9.989765091851143e-07, "loss": -0.0, "reward": 1.7440171241760254, "reward_std": 0.2823379933834076, "rewards/accuracy_reward": 0.576829731464386, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0546875, "step": 296, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 474.046875, "epoch": 0.02043906131718395, "grad_norm": 2.413400200494443, "kl": 0.1923828125, "learning_rate": 9.989695844053464e-07, "loss": 0.0, "reward": 1.848694086074829, "reward_std": 0.20257353782653809, "rewards/accuracy_reward": 0.6924439668655396, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 297, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 534.5625, "epoch": 0.0205078797054573, "grad_norm": 3.0325845746866706, "kl": 0.177734375, "learning_rate": 9.989626363026644e-07, "loss": 0.0, "reward": 2.2171874046325684, "reward_std": 0.19359153509140015, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0546875, "step": 298, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 480.15625, "epoch": 0.020576698093730646, "grad_norm": 0.49822868955915434, "kl": 0.189453125, "learning_rate": 9.989556648773933e-07, "loss": 0.0, "reward": 2.6152901649475098, "reward_std": 0.0035270885564386845, "rewards/accuracy_reward": 0.9152899980545044, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 299, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 502.921875, "epoch": 0.02064551648200399, "grad_norm": 0.8837219127489035, "kl": 0.185546875, "learning_rate": 9.989486701298586e-07, "loss": 0.0, "reward": 1.6502234935760498, "reward_std": 0.1767524629831314, "rewards/accuracy_reward": 0.5314735174179077, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 300, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 466.953125, "epoch": 0.020714334870277337, "grad_norm": 0.7955494497715209, "kl": 0.2138671875, "learning_rate": 9.989416520603877e-07, "loss": 0.0, "reward": 1.8781250715255737, "reward_std": 0.11363068222999573, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 301, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 519.125, "epoch": 0.020783153258550686, "grad_norm": 2.9259135915003287, "kl": 0.1630859375, "learning_rate": 9.989346106693081e-07, "loss": 0.0, "reward": 2.2249999046325684, "reward_std": 0.19559556245803833, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0625, "step": 302, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 473.03125, "epoch": 0.020851971646824032, "grad_norm": 1.1588527632865553, "kl": 0.2119140625, "learning_rate": 9.989275459569496e-07, "loss": 0.0, "reward": 1.782794713973999, "reward_std": 0.20481131970882416, "rewards/accuracy_reward": 0.6218571066856384, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0078125, "step": 303, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 467.53125, "epoch": 0.020920790035097378, "grad_norm": 2.1727145913601422, "kl": 0.1953125, "learning_rate": 9.989204579236416e-07, "loss": 0.0, "reward": 2.453125, "reward_std": 0.1831696480512619, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 304, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 452.4375, "epoch": 0.020989608423370724, "grad_norm": 1.7081679343707046, "kl": 0.208984375, "learning_rate": 9.98913346569716e-07, "loss": 0.0, "reward": 1.993749976158142, "reward_std": 0.15782484412193298, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.265625, "step": 305, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 469.4375, "epoch": 0.021058426811644073, "grad_norm": 4.2005369844777025, "kl": 0.2001953125, "learning_rate": 9.98906211895505e-07, "loss": 0.0, "reward": 2.6812500953674316, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 306, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 488.859375, "epoch": 0.02112724519991742, "grad_norm": 2.5173611312234048, "kl": 0.1748046875, "learning_rate": 9.988990539013425e-07, "loss": 0.0, "reward": 2.2359375953674316, "reward_std": 0.29903644323349, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1484375, "step": 307, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 467.1875, "epoch": 0.021196063588190765, "grad_norm": 2.1698533258197785, "kl": 0.2236328125, "learning_rate": 9.988918725875624e-07, "loss": 0.0, "reward": 2.511622905731201, "reward_std": 0.16078609228134155, "rewards/accuracy_reward": 0.8381854295730591, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 308, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 469.46875, "epoch": 0.02126488197646411, "grad_norm": 0.9781393874861872, "kl": 0.2080078125, "learning_rate": 9.988846679545007e-07, "loss": 0.0, "reward": 2.4824740886688232, "reward_std": 0.14691263437271118, "rewards/accuracy_reward": 0.8652864694595337, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4296875, "step": 309, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 512.40625, "epoch": 0.02133370036473746, "grad_norm": 1.97250895781121, "kl": 0.1708984375, "learning_rate": 9.988774400024943e-07, "loss": 0.0, "reward": 2.1953125, "reward_std": 0.3948599100112915, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.1015625, "step": 310, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 529.53125, "epoch": 0.021402518753010805, "grad_norm": 1.7491394759503576, "kl": 0.17578125, "learning_rate": 9.988701887318809e-07, "loss": -0.0, "reward": 2.0250000953674316, "reward_std": 0.27587074041366577, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.03125, "step": 311, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 480.4375, "epoch": 0.02147133714128415, "grad_norm": 1.1558024424930726, "kl": 0.205078125, "learning_rate": 9.98862914142999e-07, "loss": 0.0, "reward": 1.9343750476837158, "reward_std": 0.25249195098876953, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 312, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 500.5625, "epoch": 0.021540155529557497, "grad_norm": 0.6420517875865306, "kl": 0.201171875, "learning_rate": 9.988556162361896e-07, "loss": -0.0, "reward": 2.487192153930664, "reward_std": 0.048436202108860016, "rewards/accuracy_reward": 0.8121921420097351, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 313, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 483.625, "epoch": 0.021608973917830846, "grad_norm": 2.411411519194656, "kl": 0.203125, "learning_rate": 9.98848295011793e-07, "loss": 0.0, "reward": 2.42512583732605, "reward_std": 0.17221692204475403, "rewards/accuracy_reward": 0.753250777721405, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 314, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 471.375, "epoch": 0.021677792306104192, "grad_norm": 1.0634411340616978, "kl": 0.234375, "learning_rate": 9.98840950470152e-07, "loss": -0.0, "reward": 2.3781251907348633, "reward_std": 0.19945892691612244, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 315, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 502.65625, "epoch": 0.021746610694377538, "grad_norm": 1.7282986115544559, "kl": 0.189453125, "learning_rate": 9.988335826116094e-07, "loss": 0.0, "reward": 2.0374999046325684, "reward_std": 0.29139184951782227, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.140625, "step": 316, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 483.0, "epoch": 0.021815429082650883, "grad_norm": 8.552203082870282, "kl": 0.197265625, "learning_rate": 9.988261914365096e-07, "loss": 0.0, "reward": 2.0560760498046875, "reward_std": 0.27645331621170044, "rewards/accuracy_reward": 0.8388886451721191, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0390625, "step": 317, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 498.421875, "epoch": 0.021884247470924233, "grad_norm": 1.0823333431111595, "kl": 0.2001953125, "learning_rate": 9.988187769451988e-07, "loss": -0.0, "reward": 2.0390625, "reward_std": 0.1691659688949585, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0078125, "step": 318, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 509.59375, "epoch": 0.02195306585919758, "grad_norm": 1.5350211519993298, "kl": 0.197265625, "learning_rate": 9.988113391380226e-07, "loss": -0.0, "reward": 1.4719126224517822, "reward_std": 0.3694583773612976, "rewards/accuracy_reward": 0.3969125747680664, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 319, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 538.0625, "epoch": 0.022021884247470924, "grad_norm": 2.35614097821652, "kl": 0.18359375, "learning_rate": 9.988038780153291e-07, "loss": 0.0, "reward": 2.1562671661376953, "reward_std": 0.3170830309391022, "rewards/accuracy_reward": 0.839079737663269, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1328125, "step": 320, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 487.09375, "epoch": 0.02209070263574427, "grad_norm": 2.0149257935621807, "kl": 0.2451171875, "learning_rate": 9.987963935774673e-07, "loss": 0.0, "reward": 2.528125047683716, "reward_std": 0.11363068222999573, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 321, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 459.171875, "epoch": 0.02215952102401762, "grad_norm": 4.069788925091971, "kl": 0.263671875, "learning_rate": 9.987888858247866e-07, "loss": 0.0, "reward": 2.257807970046997, "reward_std": 0.2087532877922058, "rewards/accuracy_reward": 0.6171829104423523, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 322, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 522.09375, "epoch": 0.022228339412290965, "grad_norm": 1.7730351938221254, "kl": 0.22265625, "learning_rate": 9.987813547576381e-07, "loss": -0.0, "reward": 2.27339506149292, "reward_std": 0.2148590087890625, "rewards/accuracy_reward": 0.6452699303627014, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 323, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 537.625, "epoch": 0.02229715780056431, "grad_norm": 2.6229796142235138, "kl": 0.18359375, "learning_rate": 9.98773800376374e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.20010189712047577, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 324, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 516.234375, "epoch": 0.022365976188837657, "grad_norm": 0.994615521080317, "kl": 0.2392578125, "learning_rate": 9.98766222681347e-07, "loss": 0.0, "reward": 2.008578300476074, "reward_std": 0.14214403927326202, "rewards/accuracy_reward": 0.8179532885551453, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 325, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 472.015625, "epoch": 0.022434794577111006, "grad_norm": 1.2953569682183466, "kl": 0.26171875, "learning_rate": 9.987586216729118e-07, "loss": 0.0, "reward": 1.858943223953247, "reward_std": 0.06049930304288864, "rewards/accuracy_reward": 0.7089431285858154, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 326, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 534.5625, "epoch": 0.02250361296538435, "grad_norm": 1.6870097129866612, "kl": 0.1923828125, "learning_rate": 9.987509973514233e-07, "loss": 0.0, "reward": 2.3140625953674316, "reward_std": 0.21856677532196045, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1328125, "step": 327, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 461.921875, "epoch": 0.022572431353657697, "grad_norm": 0.588153078993171, "kl": 0.26171875, "learning_rate": 9.98743349717238e-07, "loss": 0.0, "reward": 2.2906250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 328, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 482.015625, "epoch": 0.022641249741931043, "grad_norm": 1.5983092427898606, "kl": 0.24609375, "learning_rate": 9.987356787707133e-07, "loss": -0.0, "reward": 2.124223232269287, "reward_std": 0.15145596861839294, "rewards/accuracy_reward": 0.7023483514785767, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.265625, "step": 329, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 480.484375, "epoch": 0.022710068130204392, "grad_norm": 1.1629291800057429, "kl": 0.2470703125, "learning_rate": 9.98727984512208e-07, "loss": -0.0, "reward": 1.9722182750701904, "reward_std": 0.2997076213359833, "rewards/accuracy_reward": 0.5409682989120483, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.296875, "step": 330, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 475.453125, "epoch": 0.022778886518477738, "grad_norm": 1.2269312388301978, "kl": 0.259765625, "learning_rate": 9.987202669420814e-07, "loss": -0.0, "reward": 2.4364047050476074, "reward_std": 0.2915830612182617, "rewards/accuracy_reward": 0.8020297288894653, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.46875, "step": 331, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 489.734375, "epoch": 0.022847704906751084, "grad_norm": 0.7570720027454264, "kl": 0.244140625, "learning_rate": 9.987125260606945e-07, "loss": -0.0, "reward": 2.5947558879852295, "reward_std": 0.08077511936426163, "rewards/accuracy_reward": 0.9010058045387268, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 332, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 477.6875, "epoch": 0.02291652329502443, "grad_norm": 1.3667937187671437, "kl": 0.263671875, "learning_rate": 9.98704761868409e-07, "loss": -0.0, "reward": 2.1078124046325684, "reward_std": 0.2467678040266037, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3984375, "step": 333, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 516.375, "epoch": 0.02298534168329778, "grad_norm": 1.8497921253756544, "kl": 0.2177734375, "learning_rate": 9.98696974365588e-07, "loss": 0.0, "reward": 2.09375, "reward_std": 0.18402281403541565, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0625, "step": 334, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 523.25, "epoch": 0.023054160071571125, "grad_norm": 1.2780231404374272, "kl": 0.2412109375, "learning_rate": 9.986891635525952e-07, "loss": 0.0, "reward": 2.289086103439331, "reward_std": 0.27276891469955444, "rewards/accuracy_reward": 0.6390860080718994, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 335, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 489.265625, "epoch": 0.02312297845984447, "grad_norm": 1.6524572720411412, "kl": 0.26953125, "learning_rate": 9.986813294297957e-07, "loss": -0.0, "reward": 2.3468751907348633, "reward_std": 0.25410783290863037, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.390625, "step": 336, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 513.421875, "epoch": 0.023191796848117816, "grad_norm": 1.8292836017740073, "kl": 0.259765625, "learning_rate": 9.98673471997556e-07, "loss": -0.0, "reward": 2.4302358627319336, "reward_std": 0.1531294882297516, "rewards/accuracy_reward": 0.7646106481552124, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 337, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 578.109375, "epoch": 0.023260615236391165, "grad_norm": 1.0616973054625696, "kl": 0.2119140625, "learning_rate": 9.986655912562434e-07, "loss": -0.0, "reward": 1.9255566596984863, "reward_std": 0.21573558449745178, "rewards/accuracy_reward": 0.3755566477775574, "rewards/format_reward": 0.9375, "rewards/transform_reward": 0.5, "step": 338, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 526.78125, "epoch": 0.02332943362466451, "grad_norm": 1.3556769257365653, "kl": 0.2490234375, "learning_rate": 9.986576872062258e-07, "loss": 0.0, "reward": 1.9563369750976562, "reward_std": 0.2905852198600769, "rewards/accuracy_reward": 0.511024534702301, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.3359375, "step": 339, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 516.484375, "epoch": 0.023398252012937857, "grad_norm": 0.8289250530063857, "kl": 0.259765625, "learning_rate": 9.98649759847873e-07, "loss": -0.0, "reward": 2.5875000953674316, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 340, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 486.53125, "epoch": 0.023467070401211203, "grad_norm": 0.6557234560222438, "kl": 0.28515625, "learning_rate": 9.986418091815556e-07, "loss": 0.0, "reward": 2.125, "reward_std": 0.1306653916835785, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 341, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 513.359375, "epoch": 0.023535888789484552, "grad_norm": 1.0645189485588493, "kl": 0.2578125, "learning_rate": 9.98633835207645e-07, "loss": -0.0, "reward": 2.0825629234313965, "reward_std": 0.1643538773059845, "rewards/accuracy_reward": 0.476313054561615, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 342, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 505.28125, "epoch": 0.023604707177757898, "grad_norm": 1.425778519007768, "kl": 0.232421875, "learning_rate": 9.986258379265137e-07, "loss": -0.0, "reward": 1.8406250476837158, "reward_std": 0.12437255680561066, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 343, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 517.0625, "epoch": 0.023673525566031244, "grad_norm": 1.0232099778336063, "kl": 0.26171875, "learning_rate": 9.986178173385362e-07, "loss": 0.0, "reward": 2.3015799522399902, "reward_std": 0.2507856488227844, "rewards/accuracy_reward": 0.6750175356864929, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 344, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 479.421875, "epoch": 0.02374234395430459, "grad_norm": 2.282856967969313, "kl": 0.296875, "learning_rate": 9.98609773444087e-07, "loss": 0.0, "reward": 2.4274966716766357, "reward_std": 0.22321683168411255, "rewards/accuracy_reward": 0.8353091478347778, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4140625, "step": 345, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 500.421875, "epoch": 0.02381116234257794, "grad_norm": 0.7143012403242295, "kl": 0.296875, "learning_rate": 9.98601706243542e-07, "loss": 0.0, "reward": 2.0625, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 346, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 519.09375, "epoch": 0.023879980730851284, "grad_norm": 1.6035454458959986, "kl": 0.232421875, "learning_rate": 9.985936157372783e-07, "loss": 0.0, "reward": 2.2453689575195312, "reward_std": 0.3281557559967041, "rewards/accuracy_reward": 0.8656812906265259, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.2421875, "step": 347, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 488.109375, "epoch": 0.02394879911912463, "grad_norm": 0.9200544501331249, "kl": 0.26953125, "learning_rate": 9.985855019256743e-07, "loss": -0.0, "reward": 2.512500047683716, "reward_std": 0.15526476502418518, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 348, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 491.484375, "epoch": 0.024017617507397976, "grad_norm": 2.8746238116460034, "kl": 0.2578125, "learning_rate": 9.98577364809109e-07, "loss": 0.0, "reward": 2.5491983890533447, "reward_std": 0.1354280263185501, "rewards/accuracy_reward": 0.8616985082626343, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 349, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 518.359375, "epoch": 0.024086435895671325, "grad_norm": 1.8758672627299764, "kl": 0.2314453125, "learning_rate": 9.985692043879633e-07, "loss": 0.0, "reward": 1.9027024507522583, "reward_std": 0.2909400761127472, "rewards/accuracy_reward": 0.7511398792266846, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.0390625, "step": 350, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 514.171875, "epoch": 0.02415525428394467, "grad_norm": 2.420456002571959, "kl": 0.25, "learning_rate": 9.98561020662618e-07, "loss": -0.0, "reward": 2.0578126907348633, "reward_std": 0.3992385268211365, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1015625, "step": 351, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 529.5, "epoch": 0.024224072672218017, "grad_norm": 0.7466245265070705, "kl": 0.2353515625, "learning_rate": 9.985528136334554e-07, "loss": 0.0, "reward": 1.7136660814285278, "reward_std": 0.07400207966566086, "rewards/accuracy_reward": 0.5824159383773804, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 352, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 526.59375, "epoch": 0.024292891060491362, "grad_norm": 1.1015491298085738, "kl": 0.2470703125, "learning_rate": 9.985445833008599e-07, "loss": 0.0, "reward": 1.8718750476837158, "reward_std": 0.16740429401397705, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.046875, "step": 353, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 474.234375, "epoch": 0.02436170944876471, "grad_norm": 1.0534639421474787, "kl": 0.2578125, "learning_rate": 9.985363296652156e-07, "loss": -0.0, "reward": 1.811901569366455, "reward_std": 0.0842132419347763, "rewards/accuracy_reward": 0.658776581287384, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 354, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 456.453125, "epoch": 0.024430527837038057, "grad_norm": 1.166732339490346, "kl": 0.3359375, "learning_rate": 9.985280527269088e-07, "loss": -0.0, "reward": 2.2406249046325684, "reward_std": 0.18223629891872406, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.25, "step": 355, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 509.9375, "epoch": 0.024499346225311403, "grad_norm": 1.2450634051937957, "kl": 0.2421875, "learning_rate": 9.98519752486326e-07, "loss": 0.0, "reward": 2.064201593399048, "reward_std": 0.1367357075214386, "rewards/accuracy_reward": 0.8735765218734741, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 356, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 486.65625, "epoch": 0.02456816461358475, "grad_norm": 0.8442518810611268, "kl": 0.314453125, "learning_rate": 9.985114289438554e-07, "loss": -0.0, "reward": 2.0286848545074463, "reward_std": 0.14233563840389252, "rewards/accuracy_reward": 0.8568097949028015, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 357, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 514.953125, "epoch": 0.024636983001858098, "grad_norm": 1.894648147150313, "kl": 0.255859375, "learning_rate": 9.98503082099886e-07, "loss": 0.0, "reward": 1.7254235744476318, "reward_std": 0.2193187028169632, "rewards/accuracy_reward": 0.591048538684845, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 358, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 516.78125, "epoch": 0.024705801390131444, "grad_norm": 1.2450554981210982, "kl": 0.263671875, "learning_rate": 9.984947119548075e-07, "loss": -0.0, "reward": 2.4501800537109375, "reward_std": 0.11882889270782471, "rewards/accuracy_reward": 0.7814302444458008, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 359, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 495.515625, "epoch": 0.02477461977840479, "grad_norm": 0.9516261635117981, "kl": 0.314453125, "learning_rate": 9.98486318509012e-07, "loss": -0.0, "reward": 2.1281251907348633, "reward_std": 0.12437255680561066, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 360, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 527.40625, "epoch": 0.024843438166678136, "grad_norm": 1.2177382630031546, "kl": 0.263671875, "learning_rate": 9.98477901762891e-07, "loss": 0.0, "reward": 2.079021453857422, "reward_std": 0.15793518722057343, "rewards/accuracy_reward": 0.855583906173706, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0390625, "step": 361, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 500.640625, "epoch": 0.024912256554951485, "grad_norm": 1.2307869234946422, "kl": 0.27734375, "learning_rate": 9.984694617168386e-07, "loss": 0.0, "reward": 2.453125, "reward_std": 0.17485956847667694, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 362, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 522.5, "epoch": 0.02498107494322483, "grad_norm": 5.025757371647956, "kl": 0.28515625, "learning_rate": 9.984609983712486e-07, "loss": -0.0, "reward": 2.287703037261963, "reward_std": 0.2916364371776581, "rewards/accuracy_reward": 0.6439529657363892, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 363, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 504.421875, "epoch": 0.025049893331498176, "grad_norm": 2.5311988109502437, "kl": 0.275390625, "learning_rate": 9.984525117265172e-07, "loss": 0.0, "reward": 2.3921875953674316, "reward_std": 0.3249186873435974, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.2265625, "step": 364, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 485.78125, "epoch": 0.025118711719771522, "grad_norm": 1.4935843635202586, "kl": 0.3203125, "learning_rate": 9.98444001783041e-07, "loss": -0.0, "reward": 1.9734375476837158, "reward_std": 0.3477127254009247, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.2421875, "step": 365, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 538.015625, "epoch": 0.02518753010804487, "grad_norm": 4.0727991141329944, "kl": 0.2578125, "learning_rate": 9.984354685412176e-07, "loss": 0.0, "reward": 2.1624999046325684, "reward_std": 0.36069148778915405, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.265625, "step": 366, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 497.21875, "epoch": 0.025256348496318217, "grad_norm": 1.9291108970790256, "kl": 0.29296875, "learning_rate": 9.984269120014457e-07, "loss": 0.0, "reward": 2.294142246246338, "reward_std": 0.17053580284118652, "rewards/accuracy_reward": 0.8144547343254089, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3046875, "step": 367, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 464.59375, "epoch": 0.025325166884591563, "grad_norm": 1.0108007867558249, "kl": 0.333984375, "learning_rate": 9.984183321641256e-07, "loss": -0.0, "reward": 2.418750047683716, "reward_std": 0.13759177923202515, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3125, "step": 368, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 497.40625, "epoch": 0.02539398527286491, "grad_norm": 0.5946560780622739, "kl": 0.265625, "learning_rate": 9.98409729029658e-07, "loss": 0.0, "reward": 1.9562500715255737, "reward_std": 0.1332113891839981, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 369, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 540.359375, "epoch": 0.025462803661138258, "grad_norm": 0.5479039764103921, "kl": 0.265625, "learning_rate": 9.984011025984455e-07, "loss": 0.0, "reward": 1.8570291996002197, "reward_std": 0.16846522688865662, "rewards/accuracy_reward": 0.7445290684700012, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.0, "step": 370, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 473.34375, "epoch": 0.025531622049411604, "grad_norm": 0.4692992629676459, "kl": 0.32421875, "learning_rate": 9.983924528708909e-07, "loss": 0.0, "reward": 1.915624976158142, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 371, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 545.46875, "epoch": 0.02560044043768495, "grad_norm": 1.190231758411736, "kl": 0.279296875, "learning_rate": 9.983837798473986e-07, "loss": 0.0, "reward": 1.7045862674713135, "reward_std": 0.05498567223548889, "rewards/accuracy_reward": 0.5561486482620239, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0234375, "step": 372, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 479.265625, "epoch": 0.025669258825958295, "grad_norm": 0.8480554764197619, "kl": 0.27734375, "learning_rate": 9.98375083528374e-07, "loss": -0.0, "reward": 1.975000023841858, "reward_std": 0.13887301087379456, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 373, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 568.5, "epoch": 0.025738077214231644, "grad_norm": 1.3466717943377482, "kl": 0.259765625, "learning_rate": 9.983663639142237e-07, "loss": 0.0, "reward": 2.149416923522949, "reward_std": 0.3250711262226105, "rewards/accuracy_reward": 0.6072294116020203, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.4296875, "step": 374, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 498.25, "epoch": 0.02580689560250499, "grad_norm": 0.7746440357043038, "kl": 0.27734375, "learning_rate": 9.983576210053553e-07, "loss": -0.0, "reward": 2.409900188446045, "reward_std": 0.14369407296180725, "rewards/accuracy_reward": 0.7474000453948975, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 375, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 518.8125, "epoch": 0.025875713990778336, "grad_norm": 1.7421550842802838, "kl": 0.26953125, "learning_rate": 9.983488548021773e-07, "loss": 0.0, "reward": 1.7030367851257324, "reward_std": 0.36181360483169556, "rewards/accuracy_reward": 0.5717867612838745, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 376, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 524.890625, "epoch": 0.02594453237905168, "grad_norm": 1.239287043023381, "kl": 0.279296875, "learning_rate": 9.983400653050996e-07, "loss": 0.0, "reward": 1.5948328971862793, "reward_std": 0.2792920470237732, "rewards/accuracy_reward": 0.47452032566070557, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.0390625, "step": 377, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 479.828125, "epoch": 0.02601335076732503, "grad_norm": 1.284653217821305, "kl": 0.31640625, "learning_rate": 9.983312525145328e-07, "loss": -0.0, "reward": 2.301292896270752, "reward_std": 0.11309510469436646, "rewards/accuracy_reward": 0.6434803605079651, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 378, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 519.0, "epoch": 0.026082169155598377, "grad_norm": 1.9472633589607695, "kl": 0.275390625, "learning_rate": 9.98322416430889e-07, "loss": 0.0, "reward": 1.9453125, "reward_std": 0.21665312349796295, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0234375, "step": 379, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 499.71875, "epoch": 0.026150987543871723, "grad_norm": 0.9390934860734661, "kl": 0.2578125, "learning_rate": 9.983135570545813e-07, "loss": 0.0, "reward": 1.8218750953674316, "reward_std": 0.12182654440402985, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 380, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 455.28125, "epoch": 0.02621980593214507, "grad_norm": 2.2963480131160767, "kl": 0.322265625, "learning_rate": 9.983046743860235e-07, "loss": -0.0, "reward": 1.9343750476837158, "reward_std": 0.20692957937717438, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 381, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 525.8125, "epoch": 0.026288624320418418, "grad_norm": 2.6630443547293825, "kl": 0.26953125, "learning_rate": 9.982957684256312e-07, "loss": 0.0, "reward": 2.5218505859375, "reward_std": 0.1732160747051239, "rewards/accuracy_reward": 0.8452879786491394, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 382, "temperature": 1.0 }, { "all_correct": 0.125, "all_wrong": 0.125, "completion_length": 505.78125, "epoch": 0.026357442708691763, "grad_norm": 1.4294179302804366, "kl": 0.29296875, "learning_rate": 9.982868391738203e-07, "loss": 0.0, "reward": 1.6604962348937988, "reward_std": 0.3393540680408478, "rewards/accuracy_reward": 0.5448711514472961, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 383, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 484.46875, "epoch": 0.02642626109696511, "grad_norm": 0.7233923359578932, "kl": 0.298828125, "learning_rate": 9.982778866310087e-07, "loss": -0.0, "reward": 2.0734376907348633, "reward_std": 0.16398736834526062, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0078125, "step": 384, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 472.796875, "epoch": 0.026495079485238455, "grad_norm": 0.8266920650221883, "kl": 0.2734375, "learning_rate": 9.982689107976142e-07, "loss": 0.0, "reward": 2.384366035461426, "reward_std": 0.09822171926498413, "rewards/accuracy_reward": 0.7218658328056335, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 385, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 509.125, "epoch": 0.026563897873511804, "grad_norm": 0.7007674203102977, "kl": 0.265625, "learning_rate": 9.982599116740567e-07, "loss": 0.0, "reward": 2.543105125427246, "reward_std": 0.09107097238302231, "rewards/accuracy_reward": 0.861855149269104, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 386, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 475.546875, "epoch": 0.02663271626178515, "grad_norm": 1.4021470839799253, "kl": 0.326171875, "learning_rate": 9.98250889260757e-07, "loss": 0.0, "reward": 2.4749999046325684, "reward_std": 0.1306653916835785, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 387, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 493.640625, "epoch": 0.026701534650058496, "grad_norm": 1.0759596306062302, "kl": 0.271484375, "learning_rate": 9.982418435581364e-07, "loss": 0.0, "reward": 2.482422351837158, "reward_std": 0.09362315386533737, "rewards/accuracy_reward": 0.7949221134185791, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 388, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 509.75, "epoch": 0.02677035303833184, "grad_norm": 0.8686938672723303, "kl": 0.28125, "learning_rate": 9.982327745666182e-07, "loss": 0.0, "reward": 2.6812500953674316, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 389, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 499.546875, "epoch": 0.02683917142660519, "grad_norm": 0.7393013991701181, "kl": 0.267578125, "learning_rate": 9.98223682286626e-07, "loss": 0.0, "reward": 2.041287899017334, "reward_std": 0.15829402208328247, "rewards/accuracy_reward": 0.8631628751754761, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 390, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 517.390625, "epoch": 0.026907989814878536, "grad_norm": 0.7986479652507037, "kl": 0.283203125, "learning_rate": 9.982145667185846e-07, "loss": -0.0, "reward": 1.8796519041061401, "reward_std": 0.11694847792387009, "rewards/accuracy_reward": 0.7234018445014954, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 391, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 526.578125, "epoch": 0.026976808203151882, "grad_norm": 1.2979075874588981, "kl": 0.271484375, "learning_rate": 9.982054278629205e-07, "loss": 0.0, "reward": 2.130402088165283, "reward_std": 0.2474927008152008, "rewards/accuracy_reward": 0.669464647769928, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.3359375, "step": 392, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 465.9375, "epoch": 0.027045626591425228, "grad_norm": 0.6776165119581153, "kl": 0.27734375, "learning_rate": 9.981962657200607e-07, "loss": 0.0, "reward": 2.5281248092651367, "reward_std": 0.06187184900045395, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 393, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 485.796875, "epoch": 0.027114444979698577, "grad_norm": 0.6892036217018892, "kl": 0.279296875, "learning_rate": 9.981870802904334e-07, "loss": -0.0, "reward": 1.75, "reward_std": 0.13887301087379456, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 394, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 467.5625, "epoch": 0.027183263367971923, "grad_norm": 0.5955643284516627, "kl": 0.333984375, "learning_rate": 9.98177871574468e-07, "loss": 0.0, "reward": 1.8968751430511475, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 395, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 483.640625, "epoch": 0.02725208175624527, "grad_norm": 1.5581981268716476, "kl": 0.27734375, "learning_rate": 9.98168639572595e-07, "loss": 0.0, "reward": 2.4746484756469727, "reward_std": 0.061610378324985504, "rewards/accuracy_reward": 0.8027734160423279, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 396, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 494.625, "epoch": 0.027320900144518614, "grad_norm": 3.3946444637379125, "kl": 0.3046875, "learning_rate": 9.981593842852458e-07, "loss": -0.0, "reward": 2.223437547683716, "reward_std": 0.06629125773906708, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0234375, "step": 397, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 502.953125, "epoch": 0.027389718532791964, "grad_norm": 1.5128417555235703, "kl": 0.248046875, "learning_rate": 9.981501057128532e-07, "loss": 0.0, "reward": 1.9480741024017334, "reward_std": 0.014380339533090591, "rewards/accuracy_reward": 0.7730740308761597, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 398, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 459.8125, "epoch": 0.02745853692106531, "grad_norm": 0.73490097683417, "kl": 0.3359375, "learning_rate": 9.981408038558507e-07, "loss": 0.0, "reward": 2.3218750953674316, "reward_std": 0.0289318785071373, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.421875, "step": 399, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 446.015625, "epoch": 0.027527355309338655, "grad_norm": 1.5230031173356344, "kl": 0.287109375, "learning_rate": 9.981314787146731e-07, "loss": -0.0, "reward": 2.4944510459899902, "reward_std": 0.15864813327789307, "rewards/accuracy_reward": 0.8163260817527771, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 400, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 475.296875, "epoch": 0.027596173697612, "grad_norm": 1.3492992766504344, "kl": 0.267578125, "learning_rate": 9.981221302897565e-07, "loss": 0.0, "reward": 2.52606201171875, "reward_std": 0.17579221725463867, "rewards/accuracy_reward": 0.8416869640350342, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 401, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 492.75, "epoch": 0.02766499208588535, "grad_norm": 0.9455119215103797, "kl": 0.287109375, "learning_rate": 9.981127585815376e-07, "loss": -0.0, "reward": 1.7537789344787598, "reward_std": 0.08293481171131134, "rewards/accuracy_reward": 0.6100289225578308, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 402, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 468.28125, "epoch": 0.027733810474158696, "grad_norm": 1.027088473118049, "kl": 0.3125, "learning_rate": 9.981033635904546e-07, "loss": 0.0, "reward": 2.40454363822937, "reward_std": 0.13921807706356049, "rewards/accuracy_reward": 0.7482935786247253, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 403, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 491.890625, "epoch": 0.027802628862432042, "grad_norm": 1.9599238416374771, "kl": 0.296875, "learning_rate": 9.980939453169466e-07, "loss": 0.0, "reward": 2.193598747253418, "reward_std": 0.35541900992393494, "rewards/accuracy_reward": 0.6498486399650574, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.390625, "step": 404, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 460.5625, "epoch": 0.027871447250705388, "grad_norm": 1.1320898828343426, "kl": 0.31640625, "learning_rate": 9.980845037614536e-07, "loss": -0.0, "reward": 2.096874952316284, "reward_std": 0.1173202246427536, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.03125, "step": 405, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 473.421875, "epoch": 0.027940265638978737, "grad_norm": 1.830755739681469, "kl": 0.259765625, "learning_rate": 9.980750389244176e-07, "loss": -0.0, "reward": 2.3249998092651367, "reward_std": 0.13887301087379456, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 406, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 508.5, "epoch": 0.028009084027252083, "grad_norm": 0.8491381062347951, "kl": 0.3125, "learning_rate": 9.9806555080628e-07, "loss": 0.0, "reward": 2.0698907375335693, "reward_std": 0.04594181850552559, "rewards/accuracy_reward": 0.8620782494544983, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0078125, "step": 407, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 487.609375, "epoch": 0.02807790241552543, "grad_norm": 1.5392502476368448, "kl": 0.3203125, "learning_rate": 9.980560394074852e-07, "loss": 0.0, "reward": 2.4122049808502197, "reward_std": 0.16227544844150543, "rewards/accuracy_reward": 0.7997048497200012, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 408, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 482.09375, "epoch": 0.028146720803798774, "grad_norm": 1.0812005467300083, "kl": 0.265625, "learning_rate": 9.980465047284775e-07, "loss": 0.0, "reward": 1.768926978111267, "reward_std": 0.17235416173934937, "rewards/accuracy_reward": 0.6251770257949829, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 409, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 474.625, "epoch": 0.028215539192072123, "grad_norm": 0.7344121373038537, "kl": 0.263671875, "learning_rate": 9.980369467697024e-07, "loss": 0.0, "reward": 2.434375047683716, "reward_std": 0.16666369140148163, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 410, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 479.15625, "epoch": 0.02828435758034547, "grad_norm": 1.545341300447179, "kl": 0.28125, "learning_rate": 9.980273655316066e-07, "loss": 0.0, "reward": 2.1156249046325684, "reward_std": 0.183067187666893, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 411, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 464.859375, "epoch": 0.028353175968618815, "grad_norm": 0.9287561328929709, "kl": 0.30078125, "learning_rate": 9.980177610146386e-07, "loss": 0.0, "reward": 2.0875000953674316, "reward_std": 0.14961488544940948, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 412, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 489.296875, "epoch": 0.02842199435689216, "grad_norm": 0.6734773023924155, "kl": 0.328125, "learning_rate": 9.980081332192464e-07, "loss": 0.0, "reward": 2.0333242416381836, "reward_std": 0.051866110414266586, "rewards/accuracy_reward": 0.8583241701126099, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 413, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 499.296875, "epoch": 0.02849081274516551, "grad_norm": 1.6893716338767968, "kl": 0.318359375, "learning_rate": 9.979984821458807e-07, "loss": -0.0, "reward": 2.078125, "reward_std": 0.25836294889450073, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.40625, "step": 414, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 485.546875, "epoch": 0.028559631133438856, "grad_norm": 2.7884480023440004, "kl": 0.34375, "learning_rate": 9.979888077949923e-07, "loss": 0.0, "reward": 2.4281249046325684, "reward_std": 0.22903376817703247, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.453125, "step": 415, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 492.6875, "epoch": 0.0286284495217122, "grad_norm": 0.633495890681658, "kl": 0.3125, "learning_rate": 9.979791101670335e-07, "loss": 0.0, "reward": 2.1875, "reward_std": 0.07312604784965515, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.421875, "step": 416, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 493.25, "epoch": 0.028697267909985547, "grad_norm": 2.838277649501399, "kl": 0.279296875, "learning_rate": 9.979693892624574e-07, "loss": 0.0, "reward": 2.53125, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 417, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 494.3125, "epoch": 0.028766086298258896, "grad_norm": 1.2197006320034816, "kl": 0.31640625, "learning_rate": 9.979596450817186e-07, "loss": -0.0, "reward": 1.6921875476837158, "reward_std": 0.17017598450183868, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0546875, "step": 418, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 504.515625, "epoch": 0.028834904686532242, "grad_norm": 1.2451564785235598, "kl": 0.275390625, "learning_rate": 9.979498776252727e-07, "loss": 0.0, "reward": 1.6224989891052246, "reward_std": 0.07415206730365753, "rewards/accuracy_reward": 0.4912490248680115, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 419, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 511.40625, "epoch": 0.028903723074805588, "grad_norm": 0.8648752179749764, "kl": 0.318359375, "learning_rate": 9.97940086893576e-07, "loss": -0.0, "reward": 1.9486150741577148, "reward_std": 0.11010819673538208, "rewards/accuracy_reward": 0.7783023715019226, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0078125, "step": 420, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 473.828125, "epoch": 0.028972541463078934, "grad_norm": 0.639865463411648, "kl": 0.283203125, "learning_rate": 9.979302728870863e-07, "loss": -0.0, "reward": 2.418750047683716, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 421, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 468.953125, "epoch": 0.029041359851352283, "grad_norm": 0.9597918609876808, "kl": 0.29296875, "learning_rate": 9.97920435606262e-07, "loss": 0.0, "reward": 2.50606107711792, "reward_std": 0.06398443877696991, "rewards/accuracy_reward": 0.8341860771179199, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 422, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 486.359375, "epoch": 0.02911017823962563, "grad_norm": 2.6739670841263887, "kl": 0.333984375, "learning_rate": 9.979105750515635e-07, "loss": -0.0, "reward": 2.285937547683716, "reward_std": 0.2783728837966919, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4453125, "step": 423, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 481.40625, "epoch": 0.029178996627898975, "grad_norm": 0.7811755506157874, "kl": 0.2890625, "learning_rate": 9.97900691223451e-07, "loss": -0.0, "reward": 1.9187500476837158, "reward_std": 0.22724725306034088, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 424, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 503.203125, "epoch": 0.02924781501617232, "grad_norm": 0.7636792298662143, "kl": 0.328125, "learning_rate": 9.97890784122387e-07, "loss": 0.0, "reward": 1.9562500715255737, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 425, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 509.734375, "epoch": 0.02931663340444567, "grad_norm": 0.5058842751105815, "kl": 0.28515625, "learning_rate": 9.978808537488344e-07, "loss": -0.0, "reward": 2.5875000953674316, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 426, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 489.390625, "epoch": 0.029385451792719015, "grad_norm": 0.6376462990177204, "kl": 0.255859375, "learning_rate": 9.978709001032575e-07, "loss": 0.0, "reward": 2.3733205795288086, "reward_std": 0.02785518579185009, "rewards/accuracy_reward": 0.7233206033706665, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 427, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 530.65625, "epoch": 0.02945427018099236, "grad_norm": 1.9534588302651124, "kl": 0.310546875, "learning_rate": 9.978609231861213e-07, "loss": 0.0, "reward": 2.03125, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 428, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 480.46875, "epoch": 0.029523088569265707, "grad_norm": 0.9161945068386557, "kl": 0.291015625, "learning_rate": 9.978509229978923e-07, "loss": 0.0, "reward": 2.474921464920044, "reward_std": 0.003285577055066824, "rewards/accuracy_reward": 0.7999213933944702, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 429, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 488.25, "epoch": 0.029591906957539056, "grad_norm": 4.528023388549742, "kl": 0.33203125, "learning_rate": 9.97840899539038e-07, "loss": 0.0, "reward": 2.3874998092651367, "reward_std": 0.21370112895965576, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.46875, "step": 430, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 524.96875, "epoch": 0.029660725345812402, "grad_norm": 0.6436937128507753, "kl": 0.33984375, "learning_rate": 9.978308528100268e-07, "loss": -0.0, "reward": 2.477247714996338, "reward_std": 0.13611698150634766, "rewards/accuracy_reward": 0.81474769115448, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 431, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 502.546875, "epoch": 0.029729543734085748, "grad_norm": 0.9300191233258396, "kl": 0.322265625, "learning_rate": 9.978207828113284e-07, "loss": 0.0, "reward": 1.9953124523162842, "reward_std": 0.15308688580989838, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0234375, "step": 432, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 491.890625, "epoch": 0.029798362122359093, "grad_norm": 1.0668558374909718, "kl": 0.28125, "learning_rate": 9.978106895434133e-07, "loss": 0.0, "reward": 2.3968749046325684, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 433, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 517.609375, "epoch": 0.029867180510632443, "grad_norm": 1.4697332724315209, "kl": 0.263671875, "learning_rate": 9.978005730067533e-07, "loss": 0.0, "reward": 1.9865918159484863, "reward_std": 0.12739908695220947, "rewards/accuracy_reward": 0.3850294351577759, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 434, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 485.828125, "epoch": 0.02993599889890579, "grad_norm": 1.484179791788673, "kl": 0.361328125, "learning_rate": 9.977904332018216e-07, "loss": 0.0, "reward": 1.9728425741195679, "reward_std": 0.27237945795059204, "rewards/accuracy_reward": 0.7978423833847046, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 435, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 500.765625, "epoch": 0.030004817287179134, "grad_norm": 0.9472817014066167, "kl": 0.33984375, "learning_rate": 9.97780270129092e-07, "loss": 0.0, "reward": 1.7876174449920654, "reward_std": 0.13920891284942627, "rewards/accuracy_reward": 0.6407424211502075, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 436, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 527.796875, "epoch": 0.03007363567545248, "grad_norm": 1.916128493645729, "kl": 0.283203125, "learning_rate": 9.977700837890396e-07, "loss": -0.0, "reward": 1.8456275463104248, "reward_std": 0.29665374755859375, "rewards/accuracy_reward": 0.7050024271011353, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 437, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 486.15625, "epoch": 0.03014245406372583, "grad_norm": 0.9973749742059235, "kl": 0.3046875, "learning_rate": 9.9775987418214e-07, "loss": 0.0, "reward": 2.265625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 438, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 507.59375, "epoch": 0.030211272451999175, "grad_norm": 1.3384489771314194, "kl": 0.345703125, "learning_rate": 9.977496413088712e-07, "loss": 0.0, "reward": 1.8500034809112549, "reward_std": 0.25263768434524536, "rewards/accuracy_reward": 0.7031283974647522, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 439, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 483.921875, "epoch": 0.03028009084027252, "grad_norm": 1.6031242873901899, "kl": 0.3515625, "learning_rate": 9.97739385169711e-07, "loss": -0.0, "reward": 2.034374952316284, "reward_std": 0.2514057159423828, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.34375, "step": 440, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 535.46875, "epoch": 0.030348909228545867, "grad_norm": 0.35732048933408966, "kl": 0.3203125, "learning_rate": 9.977291057651388e-07, "loss": 0.0, "reward": 2.6656250953674316, "reward_std": 0.09722718596458435, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 441, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 507.109375, "epoch": 0.030417727616819216, "grad_norm": 1.5265414382619087, "kl": 0.330078125, "learning_rate": 9.977188030956354e-07, "loss": 0.0, "reward": 1.9278526306152344, "reward_std": 0.259934663772583, "rewards/accuracy_reward": 0.7653525471687317, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 442, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 477.328125, "epoch": 0.03048654600509256, "grad_norm": 1.3112103239083557, "kl": 0.34375, "learning_rate": 9.977084771616822e-07, "loss": 0.0, "reward": 2.354687452316284, "reward_std": 0.1817484200000763, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4609375, "step": 443, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 490.203125, "epoch": 0.030555364393365907, "grad_norm": 9.729786231292904, "kl": 0.337890625, "learning_rate": 9.976981279637618e-07, "loss": 0.0, "reward": 2.325000047683716, "reward_std": 0.13887302577495575, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 444, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 490.046875, "epoch": 0.030624182781639253, "grad_norm": 0.8037625510114076, "kl": 0.333984375, "learning_rate": 9.97687755502358e-07, "loss": -0.0, "reward": 2.5015625953674316, "reward_std": 0.1204654723405838, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4140625, "step": 445, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 470.171875, "epoch": 0.030693001169912602, "grad_norm": 0.5142592050524716, "kl": 0.318359375, "learning_rate": 9.976773597779554e-07, "loss": 0.0, "reward": 2.106250047683716, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 446, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 502.734375, "epoch": 0.030761819558185948, "grad_norm": 0.7994674749642238, "kl": 0.330078125, "learning_rate": 9.976669407910406e-07, "loss": 0.0, "reward": 1.9765467643737793, "reward_std": 0.06988290697336197, "rewards/accuracy_reward": 0.7952966690063477, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 447, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 479.171875, "epoch": 0.030830637946459294, "grad_norm": 1.2910882200633633, "kl": 0.34375, "learning_rate": 9.976564985420996e-07, "loss": -0.0, "reward": 2.3172202110290527, "reward_std": 0.08334086835384369, "rewards/accuracy_reward": 0.7062826156616211, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4453125, "step": 448, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 465.3125, "epoch": 0.03089945633473264, "grad_norm": 1.0195418442927122, "kl": 0.326171875, "learning_rate": 9.976460330316214e-07, "loss": 0.0, "reward": 2.398799419403076, "reward_std": 0.09521268308162689, "rewards/accuracy_reward": 0.7503616809844971, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4765625, "step": 449, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 507.984375, "epoch": 0.03096827472300599, "grad_norm": 1.183300665599282, "kl": 0.2734375, "learning_rate": 9.976355442600948e-07, "loss": 0.0, "reward": 1.7495415210723877, "reward_std": 0.13579124212265015, "rewards/accuracy_reward": 0.6432914733886719, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 450, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 497.515625, "epoch": 0.031037093111279335, "grad_norm": 3.895175076489522, "kl": 0.2734375, "learning_rate": 9.9762503222801e-07, "loss": -0.0, "reward": 2.106264114379883, "reward_std": 0.27943921089172363, "rewards/accuracy_reward": 0.4812641739845276, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 451, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 499.109375, "epoch": 0.03110591149955268, "grad_norm": 0.8697928385644692, "kl": 0.30078125, "learning_rate": 9.976144969358584e-07, "loss": 0.0, "reward": 2.3171873092651367, "reward_std": 0.2221989780664444, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 452, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 484.234375, "epoch": 0.031174729887826026, "grad_norm": 1.2332311017427307, "kl": 0.32421875, "learning_rate": 9.976039383841325e-07, "loss": -0.0, "reward": 1.7835055589675903, "reward_std": 0.1243206039071083, "rewards/accuracy_reward": 0.6491305828094482, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 453, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 512.296875, "epoch": 0.031243548276099375, "grad_norm": 1.1001362306355738, "kl": 0.33203125, "learning_rate": 9.97593356573326e-07, "loss": 0.0, "reward": 1.8541219234466553, "reward_std": 0.2170049399137497, "rewards/accuracy_reward": 0.7134968042373657, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 454, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 475.296875, "epoch": 0.03131236666437272, "grad_norm": 1.0242955376627567, "kl": 0.31640625, "learning_rate": 9.975827515039331e-07, "loss": -0.0, "reward": 2.323371171951294, "reward_std": 0.14022743701934814, "rewards/accuracy_reward": 0.6608712077140808, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 455, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 486.171875, "epoch": 0.03138118505264607, "grad_norm": 1.0377401008684661, "kl": 0.314453125, "learning_rate": 9.975721231764499e-07, "loss": 0.0, "reward": 2.020650863647461, "reward_std": 0.14942651987075806, "rewards/accuracy_reward": 0.8362758159637451, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 456, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 463.4375, "epoch": 0.031450003440919416, "grad_norm": 1.061152639739024, "kl": 0.33203125, "learning_rate": 9.975614715913729e-07, "loss": 0.0, "reward": 2.4625000953674316, "reward_std": 0.17218002676963806, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.46875, "step": 457, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 511.765625, "epoch": 0.03151882182919276, "grad_norm": 1.1719272003508443, "kl": 0.294921875, "learning_rate": 9.975507967492002e-07, "loss": 0.0, "reward": 2.1281707286834717, "reward_std": 0.06588680297136307, "rewards/accuracy_reward": 0.9312957525253296, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 458, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 466.15625, "epoch": 0.03158764021746611, "grad_norm": 1.2409293329711535, "kl": 0.291015625, "learning_rate": 9.975400986504306e-07, "loss": -0.0, "reward": 1.9187499284744263, "reward_std": 0.21085786819458008, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 459, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 488.890625, "epoch": 0.03165645860573946, "grad_norm": 2.721793775566521, "kl": 0.2734375, "learning_rate": 9.975293772955642e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.13887301087379456, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 460, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 482.859375, "epoch": 0.0317252769940128, "grad_norm": 0.8893688134031119, "kl": 0.341796875, "learning_rate": 9.975186326851023e-07, "loss": -0.0, "reward": 1.8321998119354248, "reward_std": 0.14531125128269196, "rewards/accuracy_reward": 0.6853247880935669, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 461, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 497.296875, "epoch": 0.03179409538228615, "grad_norm": 2.050409964302167, "kl": 0.279296875, "learning_rate": 9.975078648195469e-07, "loss": 0.0, "reward": 2.0676217079162598, "reward_std": 0.09789103269577026, "rewards/accuracy_reward": 0.8801218271255493, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 462, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 461.546875, "epoch": 0.03186291377055949, "grad_norm": 0.538654071865163, "kl": 0.3203125, "learning_rate": 9.974970736994013e-07, "loss": -0.0, "reward": 2.483419895172119, "reward_std": 0.0172110702842474, "rewards/accuracy_reward": 0.8084197640419006, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 463, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 479.140625, "epoch": 0.03193173215883284, "grad_norm": 0.0, "kl": 0.2890625, "learning_rate": 9.9748625932517e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 464, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 497.25, "epoch": 0.03200055054710619, "grad_norm": 0.5770109999990225, "kl": 0.330078125, "learning_rate": 9.974754216973587e-07, "loss": 0.0, "reward": 1.9968750476837158, "reward_std": 0.15026019513607025, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 465, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 483.390625, "epoch": 0.03206936893537953, "grad_norm": 153.9462316240275, "kl": 0.28125, "learning_rate": 9.974645608164737e-07, "loss": -0.0, "reward": 1.9216933250427246, "reward_std": 0.13420812785625458, "rewards/accuracy_reward": 0.7560684680938721, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 466, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 491.171875, "epoch": 0.03213818732365288, "grad_norm": 1.416567324019278, "kl": 0.279296875, "learning_rate": 9.974536766830227e-07, "loss": 0.0, "reward": 2.1812500953674316, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 467, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 487.890625, "epoch": 0.03220700571192623, "grad_norm": 0.5484203301019633, "kl": 0.357421875, "learning_rate": 9.974427692975142e-07, "loss": 0.0, "reward": 1.975000023841858, "reward_std": 0.08017838001251221, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 468, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 517.765625, "epoch": 0.03227582410019957, "grad_norm": 1.0666890725977274, "kl": 0.25390625, "learning_rate": 9.974318386604585e-07, "loss": 0.0, "reward": 2.1967732906341553, "reward_std": 0.0968112200498581, "rewards/accuracy_reward": 0.5561482906341553, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 469, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 471.28125, "epoch": 0.03234464248847292, "grad_norm": 0.8367552959115133, "kl": 0.298828125, "learning_rate": 9.974208847723665e-07, "loss": -0.0, "reward": 2.138099431991577, "reward_std": 0.008674152195453644, "rewards/accuracy_reward": 0.9380993843078613, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 470, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 532.796875, "epoch": 0.032413460876746264, "grad_norm": 0.6198905303418882, "kl": 0.296875, "learning_rate": 9.974099076337499e-07, "loss": 0.0, "reward": 1.9968749284744263, "reward_std": 0.15026019513607025, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 471, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 525.578125, "epoch": 0.03248227926501961, "grad_norm": 1.5967336290289098, "kl": 0.2734375, "learning_rate": 9.973989072451217e-07, "loss": 0.0, "reward": 2.339062452316284, "reward_std": 0.17235727608203888, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 472, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 529.8125, "epoch": 0.03255109765329296, "grad_norm": 1.0472349027693757, "kl": 0.302734375, "learning_rate": 9.973878836069967e-07, "loss": -0.0, "reward": 2.1420555114746094, "reward_std": 0.22773945331573486, "rewards/accuracy_reward": 0.5998678207397461, "rewards/format_reward": 0.921875, "rewards/transform_reward": 0.4921875, "step": 473, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 470.1875, "epoch": 0.032619916041566305, "grad_norm": 0.0, "kl": 0.326171875, "learning_rate": 9.973768367198895e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 474, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 548.734375, "epoch": 0.032688734429839654, "grad_norm": 1.0209759284303774, "kl": 0.318359375, "learning_rate": 9.973657665843169e-07, "loss": 0.0, "reward": 2.6578125953674316, "reward_std": 0.11932426691055298, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 475, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 467.359375, "epoch": 0.032757552818113, "grad_norm": 0.5247254738990947, "kl": 0.341796875, "learning_rate": 9.973546732007961e-07, "loss": 0.0, "reward": 2.5343751907348633, "reward_std": 0.0289318785071373, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.484375, "step": 476, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 506.359375, "epoch": 0.032826371206386346, "grad_norm": 0.6566120221445747, "kl": 0.34765625, "learning_rate": 9.973435565698456e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.07312604784965515, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.484375, "step": 477, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 489.84375, "epoch": 0.032895189594659695, "grad_norm": 0.0, "kl": 0.3125, "learning_rate": 9.973324166919853e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 478, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 520.953125, "epoch": 0.03296400798293304, "grad_norm": 0.8266656972221456, "kl": 0.29296875, "learning_rate": 9.973212535677356e-07, "loss": -0.0, "reward": 1.9987212419509888, "reward_std": 0.12665869295597076, "rewards/accuracy_reward": 0.8205963373184204, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 479, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 490.015625, "epoch": 0.033032826371206386, "grad_norm": 0.43524286960889375, "kl": 0.322265625, "learning_rate": 9.973100671976185e-07, "loss": -0.0, "reward": 2.5796875953674316, "reward_std": 0.07732626795768738, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 480, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 484.109375, "epoch": 0.033101644759479736, "grad_norm": 1.9482758690067095, "kl": 0.291015625, "learning_rate": 9.972988575821566e-07, "loss": 0.0, "reward": 1.9375, "reward_std": 0.14961488544940948, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 481, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 466.625, "epoch": 0.03317046314775308, "grad_norm": 0.9017513881711767, "kl": 0.34765625, "learning_rate": 9.972876247218743e-07, "loss": 0.0, "reward": 2.342533588409424, "reward_std": 0.09754247963428497, "rewards/accuracy_reward": 0.7019084095954895, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.46875, "step": 482, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 490.3125, "epoch": 0.03323928153602643, "grad_norm": 0.682082412618795, "kl": 0.326171875, "learning_rate": 9.972763686172962e-07, "loss": -0.0, "reward": 1.9409794807434082, "reward_std": 0.07338672131299973, "rewards/accuracy_reward": 0.7722294330596924, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 483, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 476.5625, "epoch": 0.033308099924299776, "grad_norm": 0.5817728703315983, "kl": 0.29296875, "learning_rate": 9.972650892689487e-07, "loss": 0.0, "reward": 2.325000047683716, "reward_std": 0.1306653916835785, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 484, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 482.1875, "epoch": 0.03337691831257312, "grad_norm": 0.5941455224130741, "kl": 0.267578125, "learning_rate": 9.97253786677359e-07, "loss": -0.0, "reward": 2.4375, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 485, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 476.875, "epoch": 0.03344573670084647, "grad_norm": 0.9085275944457877, "kl": 0.34375, "learning_rate": 9.972424608430553e-07, "loss": -0.0, "reward": 2.2592101097106934, "reward_std": 0.16982048749923706, "rewards/accuracy_reward": 0.6779599189758301, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 486, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 481.40625, "epoch": 0.03351455508911981, "grad_norm": 1.5318406772473383, "kl": 0.326171875, "learning_rate": 9.97231111766567e-07, "loss": 0.0, "reward": 2.106250047683716, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 487, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 481.96875, "epoch": 0.03358337347739316, "grad_norm": 0.0, "kl": 0.271484375, "learning_rate": 9.972197394484248e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 488, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 491.828125, "epoch": 0.03365219186566651, "grad_norm": 0.5797423202413758, "kl": 0.26953125, "learning_rate": 9.972083438891602e-07, "loss": 0.0, "reward": 2.1812500953674316, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 489, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 462.15625, "epoch": 0.03372101025393985, "grad_norm": 0.4726534722246735, "kl": 0.310546875, "learning_rate": 9.971969250893054e-07, "loss": -0.0, "reward": 2.0875000953674316, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 490, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 484.234375, "epoch": 0.0337898286422132, "grad_norm": 0.637831868403718, "kl": 0.2890625, "learning_rate": 9.971854830493946e-07, "loss": -0.0, "reward": 1.9375, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 491, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 478.046875, "epoch": 0.03385864703048655, "grad_norm": 0.5988406814253711, "kl": 0.296875, "learning_rate": 9.971740177699625e-07, "loss": 0.0, "reward": 2.1624999046325684, "reward_std": 0.1060660183429718, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 492, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 468.5625, "epoch": 0.03392746541875989, "grad_norm": 1.052460628002109, "kl": 0.37890625, "learning_rate": 9.971625292515452e-07, "loss": -0.0, "reward": 2.276411294937134, "reward_std": 0.12224727123975754, "rewards/accuracy_reward": 0.6717237830162048, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4453125, "step": 493, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 471.953125, "epoch": 0.03399628380703324, "grad_norm": 0.8805159943740547, "kl": 0.3125, "learning_rate": 9.971510174946794e-07, "loss": -0.0, "reward": 2.0738601684570312, "reward_std": 0.022686436772346497, "rewards/accuracy_reward": 0.8738600015640259, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 494, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 487.9375, "epoch": 0.03406510219530658, "grad_norm": 0.6347036473242256, "kl": 0.26953125, "learning_rate": 9.971394824999034e-07, "loss": -0.0, "reward": 2.453915596008301, "reward_std": 0.056642595678567886, "rewards/accuracy_reward": 0.7820405960083008, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 495, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 479.1875, "epoch": 0.03413392058357993, "grad_norm": 0.0, "kl": 0.30078125, "learning_rate": 9.971279242677562e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 496, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 490.21875, "epoch": 0.03420273897185328, "grad_norm": 2.534288497940674, "kl": 0.306640625, "learning_rate": 9.97116342798778e-07, "loss": 0.0, "reward": 2.009450912475586, "reward_std": 0.13551011681556702, "rewards/accuracy_reward": 0.8282008171081543, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 497, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 461.96875, "epoch": 0.034271557360126624, "grad_norm": 2.3647193750968034, "kl": 0.287109375, "learning_rate": 9.971047380935105e-07, "loss": -0.0, "reward": 2.039215087890625, "reward_std": 0.017797298729419708, "rewards/accuracy_reward": 0.8392151594161987, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 498, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 439.515625, "epoch": 0.03434037574839997, "grad_norm": 1.1828664147646253, "kl": 0.32421875, "learning_rate": 9.97093110152496e-07, "loss": 0.0, "reward": 1.9648656845092773, "reward_std": 0.038125697523355484, "rewards/accuracy_reward": 0.7711156606674194, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 499, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 481.0, "epoch": 0.03440919413667332, "grad_norm": 0.9949814286760928, "kl": 0.279296875, "learning_rate": 9.970814589762777e-07, "loss": 0.0, "reward": 1.9666038751602173, "reward_std": 0.09264449030160904, "rewards/accuracy_reward": 0.7853537797927856, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 500, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 482.640625, "epoch": 0.034478012524946665, "grad_norm": 0.7436069917699148, "kl": 0.28125, "learning_rate": 9.970697845654007e-07, "loss": -0.0, "reward": 2.418750047683716, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 501, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 443.125, "epoch": 0.034546830913220014, "grad_norm": 1.5043389046204034, "kl": 0.40234375, "learning_rate": 9.970580869204099e-07, "loss": -0.0, "reward": 1.881250023841858, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 502, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 466.203125, "epoch": 0.034615649301493356, "grad_norm": 1.4441168949993644, "kl": 0.359375, "learning_rate": 9.97046366041853e-07, "loss": -0.0, "reward": 2.3053088188171387, "reward_std": 0.31812071800231934, "rewards/accuracy_reward": 0.6584336757659912, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 503, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 454.109375, "epoch": 0.034684467689766706, "grad_norm": 0.0, "kl": 0.328125, "learning_rate": 9.970346219302775e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 504, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 472.15625, "epoch": 0.034753286078040055, "grad_norm": 0.7347561228025072, "kl": 0.361328125, "learning_rate": 9.97022854586232e-07, "loss": 0.0, "reward": 1.8149936199188232, "reward_std": 0.0038941337261348963, "rewards/accuracy_reward": 0.6649935245513916, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 505, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 446.9375, "epoch": 0.0348221044663134, "grad_norm": 1.6105661282996295, "kl": 0.34375, "learning_rate": 9.97011064010267e-07, "loss": 0.0, "reward": 2.121875047683716, "reward_std": 0.12855204939842224, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 506, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 459.828125, "epoch": 0.034890922854586746, "grad_norm": 0.0, "kl": 0.306640625, "learning_rate": 9.969992502029334e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 507, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 413.953125, "epoch": 0.034959741242860096, "grad_norm": 0.6155286727422034, "kl": 0.353515625, "learning_rate": 9.969874131647834e-07, "loss": 0.0, "reward": 2.264312744140625, "reward_std": 0.009242966771125793, "rewards/accuracy_reward": 0.7393127083778381, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 508, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 488.515625, "epoch": 0.03502855963113344, "grad_norm": 0.792899762425931, "kl": 0.296875, "learning_rate": 9.969755528963703e-07, "loss": 0.0, "reward": 1.9671849012374878, "reward_std": 0.10174080729484558, "rewards/accuracy_reward": 0.7953099608421326, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 509, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 465.78125, "epoch": 0.03509737801940679, "grad_norm": 2.5662538662084713, "kl": 0.322265625, "learning_rate": 9.969636693982486e-07, "loss": -0.0, "reward": 2.3658225536346436, "reward_std": 0.25714412331581116, "rewards/accuracy_reward": 0.6970723867416382, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 510, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 461.671875, "epoch": 0.03516619640768013, "grad_norm": 0.8925656835421799, "kl": 0.3828125, "learning_rate": 9.969517626709736e-07, "loss": -0.0, "reward": 2.537606716156006, "reward_std": 0.08157504349946976, "rewards/accuracy_reward": 0.8532317280769348, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 511, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 473.953125, "epoch": 0.03523501479595348, "grad_norm": 2.915635456973238, "kl": 0.28125, "learning_rate": 9.969398327151019e-07, "loss": 0.0, "reward": 2.231250047683716, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 512, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 475.84375, "epoch": 0.03530383318422683, "grad_norm": 1.6600960448185107, "kl": 0.3203125, "learning_rate": 9.969278795311913e-07, "loss": 0.0, "reward": 1.9268224239349365, "reward_std": 0.27738672494888306, "rewards/accuracy_reward": 0.7736974954605103, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 513, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 441.828125, "epoch": 0.03537265157250017, "grad_norm": 0.41443261914450297, "kl": 0.31640625, "learning_rate": 9.969159031198002e-07, "loss": 0.0, "reward": 2.6812500953674316, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 514, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 454.4375, "epoch": 0.03544146996077352, "grad_norm": 0.660127848197739, "kl": 0.306640625, "learning_rate": 9.969039034814887e-07, "loss": -0.0, "reward": 1.8406250476837158, "reward_std": 0.12437255680561066, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 515, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 447.1875, "epoch": 0.03551028834904687, "grad_norm": 1.9541011405364679, "kl": 0.330078125, "learning_rate": 9.968918806168174e-07, "loss": 0.0, "reward": 2.528125047683716, "reward_std": 0.11363068222999573, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 516, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 446.21875, "epoch": 0.03557910673732021, "grad_norm": 1.210829955959062, "kl": 0.3984375, "learning_rate": 9.968798345263484e-07, "loss": -0.0, "reward": 2.524569511413574, "reward_std": 0.15448427200317383, "rewards/accuracy_reward": 0.894882082939148, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4453125, "step": 517, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 464.9375, "epoch": 0.03564792512559356, "grad_norm": 0.4663396435064515, "kl": 0.37890625, "learning_rate": 9.96867765210645e-07, "loss": -0.0, "reward": 2.0875000953674316, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 518, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 469.015625, "epoch": 0.0357167435138669, "grad_norm": 0.4715141096546146, "kl": 0.28515625, "learning_rate": 9.96855672670271e-07, "loss": -0.0, "reward": 1.9375, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 519, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 458.234375, "epoch": 0.03578556190214025, "grad_norm": 0.5239659499852986, "kl": 0.291015625, "learning_rate": 9.968435569057918e-07, "loss": 0.0, "reward": 2.3812499046325684, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 520, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 428.09375, "epoch": 0.0358543802904136, "grad_norm": 1.0001490420450643, "kl": 0.337890625, "learning_rate": 9.968314179177736e-07, "loss": -0.0, "reward": 1.8406250476837158, "reward_std": 0.12437255680561066, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 521, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 446.609375, "epoch": 0.03592319867868694, "grad_norm": 0.912330163294034, "kl": 0.32421875, "learning_rate": 9.96819255706784e-07, "loss": -0.0, "reward": 2.0133566856384277, "reward_std": 0.07775124907493591, "rewards/accuracy_reward": 0.8321065902709961, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 522, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 475.9375, "epoch": 0.03599201706696029, "grad_norm": 2.4218220312974297, "kl": 0.28515625, "learning_rate": 9.968070702733915e-07, "loss": -0.0, "reward": 2.2887964248657227, "reward_std": 0.2157672643661499, "rewards/accuracy_reward": 0.6450462341308594, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 523, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 450.21875, "epoch": 0.03606083545523364, "grad_norm": 1.3842177604851469, "kl": 0.40625, "learning_rate": 9.967948616181653e-07, "loss": -0.0, "reward": 2.0121357440948486, "reward_std": 0.09374725818634033, "rewards/accuracy_reward": 0.8183857202529907, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 524, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 450.40625, "epoch": 0.036129653843506984, "grad_norm": 1.5127519880291593, "kl": 0.3984375, "learning_rate": 9.967826297416764e-07, "loss": -0.0, "reward": 1.8359375, "reward_std": 0.17404134571552277, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.2734375, "step": 525, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 402.40625, "epoch": 0.03619847223178033, "grad_norm": 1.7729848765355738, "kl": 0.3359375, "learning_rate": 9.967703746444964e-07, "loss": 0.0, "reward": 1.8156249523162842, "reward_std": 0.16666369140148163, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 526, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 462.734375, "epoch": 0.036267290620053676, "grad_norm": 0.0, "kl": 0.28515625, "learning_rate": 9.967580963271982e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 527, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 427.0625, "epoch": 0.036336109008327025, "grad_norm": 0.7977336415399223, "kl": 0.390625, "learning_rate": 9.967457947903557e-07, "loss": 0.0, "reward": 2.520312547683716, "reward_std": 0.022097086533904076, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3203125, "step": 528, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 416.96875, "epoch": 0.036404927396600374, "grad_norm": 1.0735524946304762, "kl": 0.3984375, "learning_rate": 9.96733470034544e-07, "loss": 0.0, "reward": 2.472651481628418, "reward_std": 0.09958969056606293, "rewards/accuracy_reward": 0.8539016246795654, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 529, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 413.28125, "epoch": 0.036473745784873716, "grad_norm": 1.1444578657508109, "kl": 0.33203125, "learning_rate": 9.96721122060339e-07, "loss": 0.0, "reward": 1.9024862051010132, "reward_std": 0.10636601597070694, "rewards/accuracy_reward": 0.7181111574172974, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 530, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 441.546875, "epoch": 0.036542564173147066, "grad_norm": 0.77185971433627, "kl": 0.3125, "learning_rate": 9.96708750868318e-07, "loss": -0.0, "reward": 2.008251905441284, "reward_std": 0.05114637687802315, "rewards/accuracy_reward": 0.833251953125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 531, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 423.953125, "epoch": 0.036611382561420415, "grad_norm": 0.0, "kl": 0.318359375, "learning_rate": 9.966963564590589e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 532, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 469.609375, "epoch": 0.03668020094969376, "grad_norm": 1.5382691195395533, "kl": 0.361328125, "learning_rate": 9.966839388331417e-07, "loss": -0.0, "reward": 2.186058521270752, "reward_std": 0.20805513858795166, "rewards/accuracy_reward": 0.5657459497451782, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4765625, "step": 533, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 431.171875, "epoch": 0.036749019337967107, "grad_norm": 0.9923736454504984, "kl": 0.32421875, "learning_rate": 9.966714979911463e-07, "loss": 0.0, "reward": 2.0093750953674316, "reward_std": 0.21968260407447815, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 534, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 394.59375, "epoch": 0.03681783772624045, "grad_norm": 1.02933339751459, "kl": 0.37109375, "learning_rate": 9.966590339336544e-07, "loss": 0.0, "reward": 2.2578125, "reward_std": 0.17816388607025146, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4765625, "step": 535, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 452.984375, "epoch": 0.0368866561145138, "grad_norm": 1.2822056345375774, "kl": 0.33984375, "learning_rate": 9.966465466612485e-07, "loss": -0.0, "reward": 2.3364455699920654, "reward_std": 0.05816802754998207, "rewards/accuracy_reward": 0.6614454984664917, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 536, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 460.515625, "epoch": 0.03695547450278715, "grad_norm": 0.8095687340140045, "kl": 0.322265625, "learning_rate": 9.966340361745126e-07, "loss": 0.0, "reward": 1.9187500476837158, "reward_std": 0.12246951460838318, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 537, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 410.265625, "epoch": 0.03702429289106049, "grad_norm": 0.45758076720277024, "kl": 0.40234375, "learning_rate": 9.966215024740312e-07, "loss": 0.0, "reward": 2.231250047683716, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 538, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 433.921875, "epoch": 0.03709311127933384, "grad_norm": 1.1105103674927537, "kl": 0.390625, "learning_rate": 9.9660894556039e-07, "loss": -0.0, "reward": 2.419909954071045, "reward_std": 0.05898282304406166, "rewards/accuracy_reward": 0.7917850017547607, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.453125, "step": 539, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 472.484375, "epoch": 0.03716192966760719, "grad_norm": 1.768938608911262, "kl": 0.3359375, "learning_rate": 9.965963654341763e-07, "loss": -0.0, "reward": 2.527395248413086, "reward_std": 0.13908490538597107, "rewards/accuracy_reward": 0.8430201411247253, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 540, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 405.640625, "epoch": 0.03723074805588053, "grad_norm": 0.6419723083435366, "kl": 0.353515625, "learning_rate": 9.965837620959778e-07, "loss": -0.0, "reward": 1.975000023841858, "reward_std": 0.13887301087379456, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 541, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 423.8125, "epoch": 0.03729956644415388, "grad_norm": 1.4340642066835405, "kl": 0.369140625, "learning_rate": 9.96571135546384e-07, "loss": 0.0, "reward": 2.5562500953674316, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 542, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 480.0, "epoch": 0.03736838483242722, "grad_norm": 1.2793836069807518, "kl": 0.310546875, "learning_rate": 9.965584857859845e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.15526476502418518, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 543, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 484.8125, "epoch": 0.03743720322070057, "grad_norm": 1.177022578272915, "kl": 0.318359375, "learning_rate": 9.965458128153711e-07, "loss": -0.0, "reward": 2.340275764465332, "reward_std": 0.05094379186630249, "rewards/accuracy_reward": 0.6902757287025452, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 544, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 460.375, "epoch": 0.03750602160897392, "grad_norm": 0.7741904915813258, "kl": 0.390625, "learning_rate": 9.96533116635136e-07, "loss": -0.0, "reward": 2.0406999588012695, "reward_std": 0.07684572041034698, "rewards/accuracy_reward": 0.8594497442245483, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 545, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 448.0, "epoch": 0.03757483999724726, "grad_norm": 0.5541836556962658, "kl": 0.359375, "learning_rate": 9.965203972458726e-07, "loss": -0.0, "reward": 2.1437501907348633, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 546, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 488.234375, "epoch": 0.03764365838552061, "grad_norm": 0.45297410919832476, "kl": 0.37109375, "learning_rate": 9.965076546481754e-07, "loss": 0.0, "reward": 2.4156250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 547, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 493.421875, "epoch": 0.03771247677379396, "grad_norm": 0.9744055539606824, "kl": 0.32421875, "learning_rate": 9.964948888426402e-07, "loss": -0.0, "reward": 2.03125, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 548, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 448.5625, "epoch": 0.037781295162067303, "grad_norm": 0.5216534225857056, "kl": 0.35546875, "learning_rate": 9.964820998298635e-07, "loss": -0.0, "reward": 2.4644525051116943, "reward_std": 0.006488873157650232, "rewards/accuracy_reward": 0.7894525527954102, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 549, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 514.625, "epoch": 0.03785011355034065, "grad_norm": 0.7356122093627219, "kl": 0.3046875, "learning_rate": 9.964692876104433e-07, "loss": -0.0, "reward": 1.75, "reward_std": 0.155264750123024, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 550, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 498.484375, "epoch": 0.037918931938613995, "grad_norm": 0.5366773009801842, "kl": 0.302734375, "learning_rate": 9.964564521849781e-07, "loss": 0.0, "reward": 2.0454983711242676, "reward_std": 0.012733045034110546, "rewards/accuracy_reward": 0.8704981803894043, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 551, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 537.65625, "epoch": 0.037987750326887344, "grad_norm": 1.1285029913321962, "kl": 0.3046875, "learning_rate": 9.964435935540683e-07, "loss": -0.0, "reward": 1.9128642082214355, "reward_std": 0.1767585277557373, "rewards/accuracy_reward": 0.806614100933075, "rewards/format_reward": 0.9375, "rewards/transform_reward": 0.0, "step": 552, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 504.203125, "epoch": 0.038056568715160694, "grad_norm": 0.2596771706361604, "kl": 0.30078125, "learning_rate": 9.964307117183147e-07, "loss": 0.0, "reward": 2.5078125, "reward_std": 0.11932426691055298, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 553, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 489.03125, "epoch": 0.038125387103434036, "grad_norm": 0.8653619006299922, "kl": 0.365234375, "learning_rate": 9.964178066783193e-07, "loss": -0.0, "reward": 2.4375, "reward_std": 0.12437254935503006, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3125, "step": 554, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 464.265625, "epoch": 0.038194205491707385, "grad_norm": 2.184788230817223, "kl": 0.341796875, "learning_rate": 9.964048784346855e-07, "loss": 0.0, "reward": 1.9562500715255737, "reward_std": 0.13321137428283691, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 555, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 466.359375, "epoch": 0.038263023879980734, "grad_norm": 1.1599412724943652, "kl": 0.376953125, "learning_rate": 9.963919269880176e-07, "loss": -0.0, "reward": 2.359375, "reward_std": 0.20970867574214935, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.421875, "step": 556, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 467.796875, "epoch": 0.03833184226825408, "grad_norm": 0.7114994404901774, "kl": 0.33984375, "learning_rate": 9.96378952338921e-07, "loss": -0.0, "reward": 2.012660264968872, "reward_std": 0.09784829616546631, "rewards/accuracy_reward": 0.8345352411270142, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 557, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 463.0625, "epoch": 0.038400660656527426, "grad_norm": 0.5396009236798051, "kl": 0.34375, "learning_rate": 9.96365954488002e-07, "loss": -0.0, "reward": 1.6733367443084717, "reward_std": 0.06279048323631287, "rewards/accuracy_reward": 0.5639616250991821, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 558, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 462.75, "epoch": 0.03846947904480077, "grad_norm": 0.657721245131848, "kl": 0.373046875, "learning_rate": 9.963529334358685e-07, "loss": 0.0, "reward": 1.9562500715255737, "reward_std": 0.1332113891839981, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 559, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 455.03125, "epoch": 0.03853829743307412, "grad_norm": 0.6900182593935333, "kl": 0.369140625, "learning_rate": 9.963398891831286e-07, "loss": 0.0, "reward": 2.03125, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 560, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 476.109375, "epoch": 0.03860711582134747, "grad_norm": 0.8295882358774979, "kl": 0.361328125, "learning_rate": 9.963268217303924e-07, "loss": -0.0, "reward": 2.3950815200805664, "reward_std": 0.0833289623260498, "rewards/accuracy_reward": 0.7325815558433533, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 561, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 497.65625, "epoch": 0.03867593420962081, "grad_norm": 0.5194697054073263, "kl": 0.345703125, "learning_rate": 9.963137310782705e-07, "loss": 0.0, "reward": 1.9826083183288574, "reward_std": 0.007911097258329391, "rewards/accuracy_reward": 0.8076083660125732, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 562, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 498.59375, "epoch": 0.03874475259789416, "grad_norm": 0.42731308409203855, "kl": 0.33203125, "learning_rate": 9.96300617227375e-07, "loss": 0.0, "reward": 2.6624999046325684, "reward_std": 0.06943651288747787, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 563, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 525.53125, "epoch": 0.03881357098616751, "grad_norm": 1.1713856589732445, "kl": 0.287109375, "learning_rate": 9.962874801783189e-07, "loss": 0.0, "reward": 2.410550117492676, "reward_std": 0.08196159452199936, "rewards/accuracy_reward": 0.7449252009391785, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 564, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 476.515625, "epoch": 0.03888238937444085, "grad_norm": 1.1817065193863567, "kl": 0.36328125, "learning_rate": 9.96274319931716e-07, "loss": -0.0, "reward": 2.125, "reward_std": 0.08017837256193161, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 565, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 490.78125, "epoch": 0.0389512077627142, "grad_norm": 8.272872745182601, "kl": 0.357421875, "learning_rate": 9.962611364881812e-07, "loss": -0.0, "reward": 2.4906249046325684, "reward_std": 0.12437255680561066, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 566, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 495.34375, "epoch": 0.03902002615098754, "grad_norm": 0.7179005082681366, "kl": 0.341796875, "learning_rate": 9.962479298483314e-07, "loss": -0.0, "reward": 2.325000047683716, "reward_std": 0.08017837256193161, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 567, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 500.375, "epoch": 0.03908884453926089, "grad_norm": 0.5723719373477691, "kl": 0.40625, "learning_rate": 9.962347000127836e-07, "loss": 0.0, "reward": 1.8781249523162842, "reward_std": 0.11363068222999573, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 568, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 511.609375, "epoch": 0.03915766292753424, "grad_norm": 1.1641714050396887, "kl": 0.369140625, "learning_rate": 9.96221446982156e-07, "loss": -0.0, "reward": 2.044980764389038, "reward_std": 0.18156588077545166, "rewards/accuracy_reward": 0.5887308120727539, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.34375, "step": 569, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 496.109375, "epoch": 0.03922648131580758, "grad_norm": 1.686853749857039, "kl": 0.380859375, "learning_rate": 9.962081707570684e-07, "loss": -0.0, "reward": 2.4671874046325684, "reward_std": 0.16097009181976318, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 570, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 503.078125, "epoch": 0.03929529970408093, "grad_norm": 0.0, "kl": 0.33984375, "learning_rate": 9.96194871338141e-07, "loss": 0.0, "reward": 2.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 571, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 472.9375, "epoch": 0.03936411809235428, "grad_norm": 0.0, "kl": 0.361328125, "learning_rate": 9.961815487259958e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 572, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 456.109375, "epoch": 0.03943293648062762, "grad_norm": 0.5952198312278759, "kl": 0.3828125, "learning_rate": 9.961682029212554e-07, "loss": -0.0, "reward": 1.803125023841858, "reward_std": 0.11363068222999573, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 573, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 483.40625, "epoch": 0.03950175486890097, "grad_norm": 1.1510967802066807, "kl": 0.34765625, "learning_rate": 9.961548339245435e-07, "loss": -0.0, "reward": 1.642585039138794, "reward_std": 0.229049414396286, "rewards/accuracy_reward": 0.5113350749015808, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 574, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 488.109375, "epoch": 0.039570573257174314, "grad_norm": 0.47130270880492825, "kl": 0.40625, "learning_rate": 9.961414417364854e-07, "loss": 0.0, "reward": 1.827186107635498, "reward_std": 0.005984932649880648, "rewards/accuracy_reward": 0.6771860122680664, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 575, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 478.28125, "epoch": 0.039639391645447664, "grad_norm": 53.07808723501445, "kl": 0.3359375, "learning_rate": 9.961280263577064e-07, "loss": 0.0, "reward": 1.824044942855835, "reward_std": 0.15130096673965454, "rewards/accuracy_reward": 0.680294930934906, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 576, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 479.40625, "epoch": 0.03970821003372101, "grad_norm": 0.4380519066898063, "kl": 0.337890625, "learning_rate": 9.96114587788834e-07, "loss": 0.0, "reward": 1.9562499523162842, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 577, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 449.0, "epoch": 0.039777028421994355, "grad_norm": 0.7734130503610411, "kl": 0.37109375, "learning_rate": 9.96101126030496e-07, "loss": 0.0, "reward": 2.0687499046325684, "reward_std": 0.12246951460838318, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 578, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 465.90625, "epoch": 0.039845846810267704, "grad_norm": 0.4702587017734914, "kl": 0.353515625, "learning_rate": 9.960876410833222e-07, "loss": -0.0, "reward": 1.9892168045043945, "reward_std": 0.005582153797149658, "rewards/accuracy_reward": 0.8142168521881104, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 579, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 496.46875, "epoch": 0.039914665198541054, "grad_norm": 0.9283325701545833, "kl": 0.404296875, "learning_rate": 9.960741329479428e-07, "loss": 0.0, "reward": 2.292858123779297, "reward_std": 0.05522081255912781, "rewards/accuracy_reward": 0.7975454330444336, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3203125, "step": 580, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 483.90625, "epoch": 0.039983483586814396, "grad_norm": 1.2602739949885695, "kl": 0.4140625, "learning_rate": 9.960606016249886e-07, "loss": -0.0, "reward": 1.9963542222976685, "reward_std": 0.08499806374311447, "rewards/accuracy_reward": 0.8307291269302368, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 581, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 476.09375, "epoch": 0.040052301975087745, "grad_norm": 0.7068473585716466, "kl": 0.380859375, "learning_rate": 9.960470471150929e-07, "loss": 0.0, "reward": 2.4652323722839355, "reward_std": 0.06480255722999573, "rewards/accuracy_reward": 0.7933573722839355, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 582, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 483.140625, "epoch": 0.04012112036336109, "grad_norm": 0.4603701873106538, "kl": 0.328125, "learning_rate": 9.960334694188885e-07, "loss": 0.0, "reward": 1.975672960281372, "reward_std": 0.008354305289685726, "rewards/accuracy_reward": 0.8006728291511536, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 583, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 504.96875, "epoch": 0.04018993875163444, "grad_norm": 1.0226502443693781, "kl": 0.310546875, "learning_rate": 9.960198685370106e-07, "loss": 0.0, "reward": 2.5619053840637207, "reward_std": 0.08562671393156052, "rewards/accuracy_reward": 0.8712801933288574, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 584, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 467.09375, "epoch": 0.040258757139907786, "grad_norm": 1.0208926718716902, "kl": 0.328125, "learning_rate": 9.960062444700947e-07, "loss": -0.0, "reward": 1.9494123458862305, "reward_std": 0.09056305140256882, "rewards/accuracy_reward": 0.7837873101234436, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 585, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 474.921875, "epoch": 0.04032757552818113, "grad_norm": 0.9616781370932352, "kl": 0.32421875, "learning_rate": 9.959925972187776e-07, "loss": -0.0, "reward": 1.7687500715255737, "reward_std": 0.1578107476234436, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 586, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 437.75, "epoch": 0.04039639391645448, "grad_norm": 0.8401080184742072, "kl": 0.384765625, "learning_rate": 9.959789267836976e-07, "loss": -0.0, "reward": 1.859375, "reward_std": 0.12182654440402985, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 587, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 443.4375, "epoch": 0.04046521230472783, "grad_norm": 0.5816806268398671, "kl": 0.376953125, "learning_rate": 9.959652331654931e-07, "loss": -0.0, "reward": 1.9720834493637085, "reward_std": 0.004357953555881977, "rewards/accuracy_reward": 0.7970833778381348, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 588, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 471.21875, "epoch": 0.04053403069300117, "grad_norm": 1.5408961747332055, "kl": 0.40625, "learning_rate": 9.959515163648046e-07, "loss": -0.0, "reward": 2.1872527599334717, "reward_std": 0.10292273759841919, "rewards/accuracy_reward": 0.6044400930404663, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4453125, "step": 589, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 454.90625, "epoch": 0.04060284908127452, "grad_norm": 0.6032057456186285, "kl": 0.35546875, "learning_rate": 9.959377763822726e-07, "loss": 0.0, "reward": 1.8935657739639282, "reward_std": 0.08421061933040619, "rewards/accuracy_reward": 0.7310656905174255, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 590, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 469.921875, "epoch": 0.04067166746954786, "grad_norm": 0.0, "kl": 0.34375, "learning_rate": 9.959240132185403e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 591, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 469.359375, "epoch": 0.04074048585782121, "grad_norm": 0.7332933603300845, "kl": 0.427734375, "learning_rate": 9.959102268742503e-07, "loss": 0.0, "reward": 1.845790147781372, "reward_std": 0.08152438700199127, "rewards/accuracy_reward": 0.6895401477813721, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 592, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 476.953125, "epoch": 0.04080930424609456, "grad_norm": 0.4096169783840116, "kl": 0.431640625, "learning_rate": 9.958964173500475e-07, "loss": -0.0, "reward": 2.5938658714294434, "reward_std": 0.00137225235812366, "rewards/accuracy_reward": 0.893865704536438, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 593, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 424.25, "epoch": 0.0408781226343679, "grad_norm": 0.6300630350555169, "kl": 0.349609375, "learning_rate": 9.958825846465769e-07, "loss": 0.0, "reward": 1.9291133880615234, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.7697381973266602, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 594, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 436.90625, "epoch": 0.04094694102264125, "grad_norm": 0.9848365890818339, "kl": 0.373046875, "learning_rate": 9.958687287644853e-07, "loss": 0.0, "reward": 2.5843749046325684, "reward_std": 0.07827534526586533, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 595, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 475.953125, "epoch": 0.0410157594109146, "grad_norm": 1.86803400224248, "kl": 0.322265625, "learning_rate": 9.958548497044204e-07, "loss": 0.0, "reward": 2.223437547683716, "reward_std": 0.16351844370365143, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 596, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 486.53125, "epoch": 0.04108457779918794, "grad_norm": 0.0, "kl": 0.3984375, "learning_rate": 9.95840947467031e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 597, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 473.203125, "epoch": 0.04115339618746129, "grad_norm": 0.0, "kl": 0.35546875, "learning_rate": 9.958270220529665e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 598, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.375, "completion_length": 431.890625, "epoch": 0.041222214575734634, "grad_norm": 0.9782881800584008, "kl": 0.38671875, "learning_rate": 9.958130734628785e-07, "loss": -0.0, "reward": 1.5980242490768433, "reward_std": 0.09185099601745605, "rewards/accuracy_reward": 0.48239922523498535, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 599, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 505.25, "epoch": 0.04129103296400798, "grad_norm": 1.6073923053977994, "kl": 0.390625, "learning_rate": 9.957991016974184e-07, "loss": -0.0, "reward": 2.4196364879608154, "reward_std": 0.1485508680343628, "rewards/accuracy_reward": 0.8118237257003784, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4140625, "step": 600, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 430.46875, "epoch": 0.04135985135228133, "grad_norm": 2.489717637429577, "kl": 0.396484375, "learning_rate": 9.957851067572394e-07, "loss": -0.0, "reward": 1.8690204620361328, "reward_std": 0.1349889636039734, "rewards/accuracy_reward": 0.712770402431488, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 601, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 477.828125, "epoch": 0.041428669740554674, "grad_norm": 1.222689376225662, "kl": 0.404296875, "learning_rate": 9.957710886429958e-07, "loss": -0.0, "reward": 2.4406251907348633, "reward_std": 0.1552342027425766, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.46875, "step": 602, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 478.015625, "epoch": 0.041497488128828024, "grad_norm": 0.0, "kl": 0.34765625, "learning_rate": 9.957570473553428e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 603, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 465.140625, "epoch": 0.04156630651710137, "grad_norm": 0.5499108561354246, "kl": 0.33984375, "learning_rate": 9.957429828949366e-07, "loss": -0.0, "reward": 1.8247780799865723, "reward_std": 0.00882986094802618, "rewards/accuracy_reward": 0.674778163433075, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 604, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 473.71875, "epoch": 0.041635124905374715, "grad_norm": 0.4246874592439647, "kl": 0.341796875, "learning_rate": 9.957288952624347e-07, "loss": -0.0, "reward": 2.616896152496338, "reward_std": 0.011233014985918999, "rewards/accuracy_reward": 0.9168959856033325, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 605, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 470.953125, "epoch": 0.041703943293648064, "grad_norm": 0.703912435268609, "kl": 0.3828125, "learning_rate": 9.957147844584957e-07, "loss": -0.0, "reward": 1.966309666633606, "reward_std": 0.16457077860832214, "rewards/accuracy_reward": 0.822559654712677, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 606, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 477.5625, "epoch": 0.04177276168192141, "grad_norm": 0.4866821202413581, "kl": 0.337890625, "learning_rate": 9.95700650483779e-07, "loss": 0.0, "reward": 2.3625001907348633, "reward_std": 0.06943651288747787, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 607, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 449.1875, "epoch": 0.041841580070194756, "grad_norm": 1.4880359873625184, "kl": 0.388671875, "learning_rate": 9.956864933389452e-07, "loss": -0.0, "reward": 2.0875000953674316, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 608, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 456.71875, "epoch": 0.041910398458468105, "grad_norm": 0.8711388328078022, "kl": 0.396484375, "learning_rate": 9.956723130246562e-07, "loss": 0.0, "reward": 2.06105375289917, "reward_std": 0.0200246199965477, "rewards/accuracy_reward": 0.8610537648200989, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 609, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 486.359375, "epoch": 0.04197921684674145, "grad_norm": 0.8347827534325984, "kl": 0.423828125, "learning_rate": 9.956581095415747e-07, "loss": 0.0, "reward": 1.911285400390625, "reward_std": 0.13978347182273865, "rewards/accuracy_reward": 0.7612853050231934, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 610, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 456.421875, "epoch": 0.0420480352350148, "grad_norm": 0.4943177844792087, "kl": 0.3828125, "learning_rate": 9.956438828903646e-07, "loss": 0.0, "reward": 2.106250047683716, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 611, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 490.1875, "epoch": 0.042116853623288146, "grad_norm": 1.0476529344614989, "kl": 0.412109375, "learning_rate": 9.95629633071691e-07, "loss": -0.0, "reward": 2.284853935241699, "reward_std": 0.27830955386161804, "rewards/accuracy_reward": 0.6629791259765625, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.5, "step": 612, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 464.53125, "epoch": 0.04218567201156149, "grad_norm": 0.0, "kl": 0.4375, "learning_rate": 9.956153600862198e-07, "loss": 0.0, "reward": 2.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 613, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 505.0625, "epoch": 0.04225449039983484, "grad_norm": 0.0, "kl": 0.33984375, "learning_rate": 9.956010639346185e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 614, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 479.03125, "epoch": 0.04232330878810818, "grad_norm": 0.9389178276740041, "kl": 0.408203125, "learning_rate": 9.955867446175548e-07, "loss": 0.0, "reward": 2.3439292907714844, "reward_std": 0.05085986107587814, "rewards/accuracy_reward": 0.6939290761947632, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 615, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 495.953125, "epoch": 0.04239212717638153, "grad_norm": 1.805885714532874, "kl": 0.33984375, "learning_rate": 9.955724021356986e-07, "loss": -0.0, "reward": 1.8406250476837158, "reward_std": 0.12437254190444946, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 616, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 450.03125, "epoch": 0.04246094556465488, "grad_norm": 0.9544298478887084, "kl": 0.453125, "learning_rate": 9.955580364897198e-07, "loss": 0.0, "reward": 2.3812499046325684, "reward_std": 0.2082977592945099, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 617, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 465.515625, "epoch": 0.04252976395292822, "grad_norm": 1.2094960169617668, "kl": 0.3515625, "learning_rate": 9.9554364768029e-07, "loss": 0.0, "reward": 2.325000047683716, "reward_std": 0.1306653916835785, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 618, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 450.3125, "epoch": 0.04259858234120157, "grad_norm": 0.8243478300666546, "kl": 0.42578125, "learning_rate": 9.95529235708082e-07, "loss": -0.0, "reward": 2.032233953475952, "reward_std": 0.16554827988147736, "rewards/accuracy_reward": 0.43223410844802856, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 619, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 477.4375, "epoch": 0.04266740072947492, "grad_norm": 1.021955493972751, "kl": 0.43359375, "learning_rate": 9.95514800573769e-07, "loss": 0.0, "reward": 1.8274006843566895, "reward_std": 0.14321748912334442, "rewards/accuracy_reward": 0.6774006485939026, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 620, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 464.765625, "epoch": 0.04273621911774826, "grad_norm": 1.1036275834022378, "kl": 0.38671875, "learning_rate": 9.955003422780263e-07, "loss": 0.0, "reward": 1.8123325109481812, "reward_std": 0.18755505979061127, "rewards/accuracy_reward": 0.6623325347900391, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 621, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 472.4375, "epoch": 0.04280503750602161, "grad_norm": 0.7720559167562409, "kl": 0.443359375, "learning_rate": 9.954858608215294e-07, "loss": 0.0, "reward": 1.7312500476837158, "reward_std": 0.1578107476234436, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 622, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 482.734375, "epoch": 0.04287385589429495, "grad_norm": 0.5742553620702683, "kl": 0.41796875, "learning_rate": 9.954713562049552e-07, "loss": 0.0, "reward": 2.4156250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 623, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 482.75, "epoch": 0.0429426742825683, "grad_norm": 0.7100582162766856, "kl": 0.38671875, "learning_rate": 9.954568284289817e-07, "loss": -0.0, "reward": 2.042119026184082, "reward_std": 0.004715663846582174, "rewards/accuracy_reward": 0.8421189188957214, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 624, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 490.65625, "epoch": 0.04301149267084165, "grad_norm": 1.8372951460468103, "kl": 0.35546875, "learning_rate": 9.95442277494288e-07, "loss": 0.0, "reward": 2.0687499046325684, "reward_std": 0.20264789462089539, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 625, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 530.609375, "epoch": 0.043080311059114994, "grad_norm": 1.172826747889312, "kl": 0.333984375, "learning_rate": 9.954277034015542e-07, "loss": -0.0, "reward": 2.3289237022399902, "reward_std": 0.25901520252227783, "rewards/accuracy_reward": 0.6726735830307007, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 626, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 501.28125, "epoch": 0.04314912944738834, "grad_norm": 1.2118861555276579, "kl": 0.361328125, "learning_rate": 9.954131061514615e-07, "loss": -0.0, "reward": 2.4562501907348633, "reward_std": 0.2379891276359558, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 627, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 499.296875, "epoch": 0.04321794783566169, "grad_norm": 1.0135984332197232, "kl": 0.353515625, "learning_rate": 9.953984857446922e-07, "loss": -0.0, "reward": 2.2421875, "reward_std": 0.08476267009973526, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 628, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 476.546875, "epoch": 0.043286766223935035, "grad_norm": 0.449014125120951, "kl": 0.4140625, "learning_rate": 9.953838421819296e-07, "loss": 0.0, "reward": 2.1812500953674316, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 629, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 479.375, "epoch": 0.043355584612208384, "grad_norm": 0.6382845608066104, "kl": 0.37109375, "learning_rate": 9.953691754638585e-07, "loss": 0.0, "reward": 2.301971673965454, "reward_std": 0.03336859866976738, "rewards/accuracy_reward": 0.6550967693328857, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 630, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 487.53125, "epoch": 0.043424403000481726, "grad_norm": 0.9661988379092932, "kl": 0.4609375, "learning_rate": 9.953544855911642e-07, "loss": -0.0, "reward": 2.4312500953674316, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 631, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 475.21875, "epoch": 0.043493221388755075, "grad_norm": 0.0, "kl": 0.3671875, "learning_rate": 9.953397725645334e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 632, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 448.484375, "epoch": 0.043562039777028425, "grad_norm": 0.0, "kl": 0.42578125, "learning_rate": 9.953250363846538e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 633, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 476.1875, "epoch": 0.04363085816530177, "grad_norm": 1.0375620727756203, "kl": 0.373046875, "learning_rate": 9.953102770522142e-07, "loss": -0.0, "reward": 1.9300956726074219, "reward_std": 0.06854359805583954, "rewards/accuracy_reward": 0.7582207322120667, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 634, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 525.84375, "epoch": 0.043699676553575116, "grad_norm": 1.2162029050630663, "kl": 0.40234375, "learning_rate": 9.952954945679044e-07, "loss": 0.0, "reward": 2.315056324005127, "reward_std": 0.1650308221578598, "rewards/accuracy_reward": 0.6806814074516296, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.453125, "step": 635, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 480.125, "epoch": 0.043768494941848465, "grad_norm": 0.46083107439340204, "kl": 0.4921875, "learning_rate": 9.952806889324154e-07, "loss": -0.0, "reward": 2.21875, "reward_std": 0.07763238251209259, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 636, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 470.59375, "epoch": 0.04383731333012181, "grad_norm": 0.5150826547537973, "kl": 0.45703125, "learning_rate": 9.952658601464396e-07, "loss": 0.0, "reward": 2.1656250953674316, "reward_std": 0.09722718596458435, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 637, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 493.203125, "epoch": 0.04390613171839516, "grad_norm": 0.9003002341021236, "kl": 0.423828125, "learning_rate": 9.952510082106696e-07, "loss": 0.0, "reward": 2.1365082263946533, "reward_std": 0.12223517149686813, "rewards/accuracy_reward": 0.538070797920227, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 638, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 506.3125, "epoch": 0.0439749501066685, "grad_norm": 0.7843727404169757, "kl": 0.35546875, "learning_rate": 9.952361331258e-07, "loss": 0.0, "reward": 2.143749952316284, "reward_std": 0.12246951460838318, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 639, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 458.984375, "epoch": 0.04404376849494185, "grad_norm": 1.0768160286450863, "kl": 0.462890625, "learning_rate": 9.952212348925258e-07, "loss": 0.0, "reward": 1.9718749523162842, "reward_std": 0.17740556597709656, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 640, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 493.453125, "epoch": 0.0441125868832152, "grad_norm": 0.7152672623936929, "kl": 0.462890625, "learning_rate": 9.952063135115435e-07, "loss": 0.0, "reward": 2.140622615814209, "reward_std": 0.06841787695884705, "rewards/accuracy_reward": 0.5187478065490723, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 641, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 534.078125, "epoch": 0.04418140527148854, "grad_norm": 2.117006624843916, "kl": 0.33203125, "learning_rate": 9.951913689835506e-07, "loss": 0.0, "reward": 2.1748156547546387, "reward_std": 0.46769461035728455, "rewards/accuracy_reward": 0.5779409408569336, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.484375, "step": 642, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 489.3125, "epoch": 0.04425022365976189, "grad_norm": 1.1348955162783652, "kl": 0.458984375, "learning_rate": 9.951764013092457e-07, "loss": -0.0, "reward": 1.931249976158142, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 643, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 460.3125, "epoch": 0.04431904204803524, "grad_norm": 2.1883285735324245, "kl": 0.458984375, "learning_rate": 9.951614104893282e-07, "loss": -0.0, "reward": 2.2011191844940186, "reward_std": 0.22875265777111053, "rewards/accuracy_reward": 0.5698692202568054, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 644, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 519.828125, "epoch": 0.04438786043630858, "grad_norm": 1.6669071686650359, "kl": 0.44140625, "learning_rate": 9.951463965244989e-07, "loss": 0.0, "reward": 2.4383463859558105, "reward_std": 0.1269039362668991, "rewards/accuracy_reward": 0.785221517086029, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 645, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 489.953125, "epoch": 0.04445667882458193, "grad_norm": 0.4852036401028107, "kl": 0.474609375, "learning_rate": 9.951313594154597e-07, "loss": 0.0, "reward": 2.106250047683716, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 646, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 499.484375, "epoch": 0.04452549721285527, "grad_norm": 0.7973253016620636, "kl": 0.451171875, "learning_rate": 9.951162991629134e-07, "loss": -0.0, "reward": 1.8847408294677734, "reward_std": 0.08256709575653076, "rewards/accuracy_reward": 0.7253657579421997, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 647, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 493.0625, "epoch": 0.04459431560112862, "grad_norm": 1.7992370644008455, "kl": 0.390625, "learning_rate": 9.951012157675638e-07, "loss": -0.0, "reward": 2.171875, "reward_std": 0.12182654440402985, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 648, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 431.5625, "epoch": 0.04466313398940197, "grad_norm": 2.0914713663595497, "kl": 0.478515625, "learning_rate": 9.950861092301161e-07, "loss": 0.0, "reward": 1.9375, "reward_std": 0.1306653916835785, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 649, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 464.71875, "epoch": 0.04473195237767531, "grad_norm": 0.9987394488070427, "kl": 0.421875, "learning_rate": 9.950709795512767e-07, "loss": 0.0, "reward": 2.1531505584716797, "reward_std": 0.06392693519592285, "rewards/accuracy_reward": 0.956275463104248, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 650, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 442.296875, "epoch": 0.04480077076594866, "grad_norm": 0.0, "kl": 0.46484375, "learning_rate": 9.95055826731752e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 651, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 460.03125, "epoch": 0.04486958915422201, "grad_norm": 0.815772481121584, "kl": 0.4453125, "learning_rate": 9.95040650772251e-07, "loss": 0.0, "reward": 2.4018020629882812, "reward_std": 0.098207026720047, "rewards/accuracy_reward": 0.739301860332489, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 652, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 480.671875, "epoch": 0.044938407542495354, "grad_norm": 0.7155606092767077, "kl": 0.3984375, "learning_rate": 9.950254516734828e-07, "loss": -0.0, "reward": 2.3010172843933105, "reward_std": 0.17681622505187988, "rewards/accuracy_reward": 0.6978924870491028, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.5, "step": 653, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 419.1875, "epoch": 0.0450072259307687, "grad_norm": 2.0761439557636425, "kl": 0.51953125, "learning_rate": 9.950102294361577e-07, "loss": -0.0, "reward": 2.3931567668914795, "reward_std": 0.1660708785057068, "rewards/accuracy_reward": 0.7931567430496216, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.421875, "step": 654, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 468.875, "epoch": 0.045076044319042045, "grad_norm": 1.228220835970478, "kl": 0.439453125, "learning_rate": 9.949949840609874e-07, "loss": 0.0, "reward": 1.9161797761917114, "reward_std": 0.2162414938211441, "rewards/accuracy_reward": 0.7661796808242798, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 655, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 448.09375, "epoch": 0.045144862707315395, "grad_norm": 7.472770137880926, "kl": 0.5546875, "learning_rate": 9.949797155486847e-07, "loss": -0.0, "reward": 2.582812547683716, "reward_std": 0.022097086533904076, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3828125, "step": 656, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 465.75, "epoch": 0.045213681095588744, "grad_norm": 0.6963822775165966, "kl": 0.515625, "learning_rate": 9.949644238999628e-07, "loss": -0.0, "reward": 2.5194647312164307, "reward_std": 0.08634960651397705, "rewards/accuracy_reward": 0.8382147550582886, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 657, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 430.140625, "epoch": 0.045282499483862086, "grad_norm": 2.9132600452377986, "kl": 0.5234375, "learning_rate": 9.94949109115537e-07, "loss": 0.0, "reward": 1.8028132915496826, "reward_std": 0.34073811769485474, "rewards/accuracy_reward": 0.6621881723403931, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 658, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 451.0625, "epoch": 0.045351317872135435, "grad_norm": 1.0301115712858806, "kl": 0.494140625, "learning_rate": 9.949337711961224e-07, "loss": -0.0, "reward": 2.250035047531128, "reward_std": 0.1263747662305832, "rewards/accuracy_reward": 0.6156601905822754, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 659, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 455.5625, "epoch": 0.045420136260408785, "grad_norm": 3.077991483261165, "kl": 0.51953125, "learning_rate": 9.949184101424365e-07, "loss": 0.0, "reward": 1.8939940929412842, "reward_std": 0.06459740549325943, "rewards/accuracy_reward": 0.722119152545929, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 660, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 420.734375, "epoch": 0.04548895464868213, "grad_norm": 2.7002990643459763, "kl": 0.58203125, "learning_rate": 9.949030259551974e-07, "loss": 0.0, "reward": 2.4502882957458496, "reward_std": 0.20165200531482697, "rewards/accuracy_reward": 0.862788200378418, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.421875, "step": 661, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 399.984375, "epoch": 0.045557773036955476, "grad_norm": 0.7572543129284476, "kl": 0.498046875, "learning_rate": 9.94887618635124e-07, "loss": 0.0, "reward": 2.543750047683716, "reward_std": 0.011572758667171001, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 662, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 398.3125, "epoch": 0.04562659142522882, "grad_norm": 0.6058658103284857, "kl": 0.5, "learning_rate": 9.948721881829364e-07, "loss": 0.0, "reward": 2.046875, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 663, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 382.6875, "epoch": 0.04569540981350217, "grad_norm": 3.1565364482212503, "kl": 0.49609375, "learning_rate": 9.948567345993556e-07, "loss": -0.0, "reward": 2.4476141929626465, "reward_std": 0.22150473296642303, "rewards/accuracy_reward": 0.8351142406463623, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 664, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 372.484375, "epoch": 0.04576422820177552, "grad_norm": 1.590516277994185, "kl": 0.55859375, "learning_rate": 9.948412578851046e-07, "loss": 0.0, "reward": 2.4283056259155273, "reward_std": 0.1959233283996582, "rewards/accuracy_reward": 0.7970556020736694, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 665, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 421.921875, "epoch": 0.04583304659004886, "grad_norm": 1.3604182923060766, "kl": 0.55859375, "learning_rate": 9.948257580409064e-07, "loss": 0.0, "reward": 2.3937501907348633, "reward_std": 0.011572758667171001, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 666, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 461.0625, "epoch": 0.04590186497832221, "grad_norm": 0.9136288460445323, "kl": 0.51953125, "learning_rate": 9.948102350674854e-07, "loss": -0.0, "reward": 2.276163339614868, "reward_std": 0.09345937520265579, "rewards/accuracy_reward": 0.663663387298584, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.5, "step": 667, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 422.34375, "epoch": 0.04597068336659556, "grad_norm": 0.5937938813666894, "kl": 0.4921875, "learning_rate": 9.947946889655676e-07, "loss": 0.0, "reward": 2.1624999046325684, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 668, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 427.3125, "epoch": 0.0460395017548689, "grad_norm": 1.8730806048147215, "kl": 0.494140625, "learning_rate": 9.94779119735879e-07, "loss": -0.0, "reward": 2.003124952316284, "reward_std": 0.11363068222999573, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 669, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 421.453125, "epoch": 0.04610832014314225, "grad_norm": 0.5666756469715079, "kl": 0.5, "learning_rate": 9.94763527379148e-07, "loss": 0.0, "reward": 1.9557924270629883, "reward_std": 0.0024790556635707617, "rewards/accuracy_reward": 0.7807923555374146, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 670, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 463.921875, "epoch": 0.04617713853141559, "grad_norm": 1.4261435602188344, "kl": 0.578125, "learning_rate": 9.94747911896103e-07, "loss": 0.0, "reward": 2.234375, "reward_std": 0.17969031631946564, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 671, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 446.75, "epoch": 0.04624595691968894, "grad_norm": 0.5626985749646677, "kl": 0.59375, "learning_rate": 9.947322732874744e-07, "loss": -0.0, "reward": 2.5562500953674316, "reward_std": 0.0770551785826683, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.46875, "step": 672, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 522.21875, "epoch": 0.04631477530796229, "grad_norm": 0.8696564603729515, "kl": 0.53515625, "learning_rate": 9.947166115539924e-07, "loss": 0.0, "reward": 1.9748821258544922, "reward_std": 0.22191712260246277, "rewards/accuracy_reward": 0.8280071020126343, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 673, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 486.078125, "epoch": 0.04638359369623563, "grad_norm": 0.47389411017363225, "kl": 0.48828125, "learning_rate": 9.947009266963898e-07, "loss": 0.0, "reward": 2.624990463256836, "reward_std": 0.009516062214970589, "rewards/accuracy_reward": 0.9249904155731201, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 674, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 510.484375, "epoch": 0.04645241208450898, "grad_norm": 1.078115452115117, "kl": 0.53515625, "learning_rate": 9.94685218715399e-07, "loss": -0.0, "reward": 2.308854103088379, "reward_std": 0.08499807119369507, "rewards/accuracy_reward": 0.6744791269302368, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 675, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 513.703125, "epoch": 0.04652123047278233, "grad_norm": 1.1074830889037401, "kl": 0.55078125, "learning_rate": 9.94669487611755e-07, "loss": -0.0, "reward": 2.1134371757507324, "reward_std": 0.2590653896331787, "rewards/accuracy_reward": 0.6603121757507324, "rewards/format_reward": 0.9375, "rewards/transform_reward": 0.375, "step": 676, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 484.046875, "epoch": 0.04659004886105567, "grad_norm": 0.9830757115659265, "kl": 0.51953125, "learning_rate": 9.946537333861928e-07, "loss": -0.0, "reward": 2.3248512744903564, "reward_std": 0.0847342386841774, "rewards/accuracy_reward": 0.6717262268066406, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 677, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 496.328125, "epoch": 0.04665886724932902, "grad_norm": 0.9653426628398223, "kl": 0.55859375, "learning_rate": 9.946379560394487e-07, "loss": -0.0, "reward": 1.9156250953674316, "reward_std": 0.183067187666893, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 678, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 507.078125, "epoch": 0.046727685637602365, "grad_norm": 1.0740626969438407, "kl": 0.55078125, "learning_rate": 9.946221555722603e-07, "loss": 0.0, "reward": 1.9187500476837158, "reward_std": 0.12246951460838318, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 679, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 509.21875, "epoch": 0.046796504025875714, "grad_norm": 1.9512030537334568, "kl": 0.546875, "learning_rate": 9.946063319853659e-07, "loss": 0.0, "reward": 2.03125, "reward_std": 0.2082977592945099, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 680, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 497.65625, "epoch": 0.04686532241414906, "grad_norm": 18.49946955841979, "kl": 0.51171875, "learning_rate": 9.945904852795053e-07, "loss": 0.0, "reward": 1.9787847995758057, "reward_std": 0.00343959778547287, "rewards/accuracy_reward": 0.8037847280502319, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 681, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 532.78125, "epoch": 0.046934140802422405, "grad_norm": 0.9754900638058251, "kl": 0.55078125, "learning_rate": 9.945746154554192e-07, "loss": -0.0, "reward": 2.0531249046325684, "reward_std": 0.16666369140148163, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 682, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 554.375, "epoch": 0.047002959190695755, "grad_norm": 0.8007046913905219, "kl": 0.50390625, "learning_rate": 9.945587225138495e-07, "loss": 0.0, "reward": 1.936568260192871, "reward_std": 0.25237250328063965, "rewards/accuracy_reward": 0.8115681409835815, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.0, "step": 683, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 479.203125, "epoch": 0.047071777578969104, "grad_norm": 0.8357111083496447, "kl": 0.5234375, "learning_rate": 9.94542806455539e-07, "loss": 0.0, "reward": 2.1374998092651367, "reward_std": 0.1306653916835785, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 684, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 495.265625, "epoch": 0.047140595967242446, "grad_norm": 1.1773143939777877, "kl": 0.54296875, "learning_rate": 9.945268672812316e-07, "loss": 0.0, "reward": 2.3873980045318604, "reward_std": 0.12566930055618286, "rewards/accuracy_reward": 0.7280229330062866, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 685, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 495.921875, "epoch": 0.047209414355515796, "grad_norm": 0.0, "kl": 0.59765625, "learning_rate": 9.945109049916722e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 686, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 497.84375, "epoch": 0.04727823274378914, "grad_norm": 1.0923823794259706, "kl": 0.53125, "learning_rate": 9.944949195876071e-07, "loss": -0.0, "reward": 2.4289731979370117, "reward_std": 0.07586538046598434, "rewards/accuracy_reward": 0.7602230310440063, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 687, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 490.375, "epoch": 0.04734705113206249, "grad_norm": 0.7855053271738723, "kl": 0.609375, "learning_rate": 9.944789110697835e-07, "loss": 0.0, "reward": 1.8546874523162842, "reward_std": 0.10227546095848083, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1796875, "step": 688, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 478.984375, "epoch": 0.047415869520335836, "grad_norm": 0.8789773136823081, "kl": 0.58203125, "learning_rate": 9.944628794389497e-07, "loss": 0.0, "reward": 2.4734957218170166, "reward_std": 0.15270110964775085, "rewards/accuracy_reward": 0.8047456741333008, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 689, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 491.109375, "epoch": 0.04748468790860918, "grad_norm": 2.4391545779976314, "kl": 0.64453125, "learning_rate": 9.94446824695855e-07, "loss": 0.0, "reward": 2.385937452316284, "reward_std": 0.15530847012996674, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4296875, "step": 690, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 529.109375, "epoch": 0.04755350629688253, "grad_norm": 1.048900318569185, "kl": 0.5546875, "learning_rate": 9.944307468412495e-07, "loss": -0.0, "reward": 1.7519235610961914, "reward_std": 0.23699849843978882, "rewards/accuracy_reward": 0.6425484418869019, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 691, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 497.6875, "epoch": 0.04762232468515588, "grad_norm": 0.8973508806699786, "kl": 0.5625, "learning_rate": 9.944146458758855e-07, "loss": 0.0, "reward": 1.6483969688415527, "reward_std": 0.24409019947052002, "rewards/accuracy_reward": 0.570271909236908, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.0, "step": 692, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 478.921875, "epoch": 0.04769114307342922, "grad_norm": 1.9804529886607793, "kl": 0.609375, "learning_rate": 9.94398521800515e-07, "loss": -0.0, "reward": 2.0625, "reward_std": 0.07312604784965515, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.296875, "step": 693, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 493.875, "epoch": 0.04775996146170257, "grad_norm": 0.49947017100456786, "kl": 0.58203125, "learning_rate": 9.943823746158914e-07, "loss": -0.0, "reward": 2.3358139991760254, "reward_std": 0.0650567039847374, "rewards/accuracy_reward": 0.9045637845993042, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.25, "step": 694, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 477.25, "epoch": 0.04782877984997591, "grad_norm": 0.8032110368959716, "kl": 0.61328125, "learning_rate": 9.943662043227701e-07, "loss": 0.0, "reward": 1.8218750953674316, "reward_std": 0.12182654440402985, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 695, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 434.296875, "epoch": 0.04789759823824926, "grad_norm": 2.054685986530576, "kl": 0.59765625, "learning_rate": 9.943500109219067e-07, "loss": -0.0, "reward": 2.372471570968628, "reward_std": 0.06577269732952118, "rewards/accuracy_reward": 0.7224715948104858, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 696, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 470.328125, "epoch": 0.04796641662652261, "grad_norm": 0.4838286752344936, "kl": 0.625, "learning_rate": 9.943337944140582e-07, "loss": 0.0, "reward": 2.3812499046325684, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 697, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 468.578125, "epoch": 0.04803523501479595, "grad_norm": 0.9191801420150821, "kl": 0.62109375, "learning_rate": 9.943175547999825e-07, "loss": -0.0, "reward": 1.9798121452331543, "reward_std": 0.05435940623283386, "rewards/accuracy_reward": 0.8048121333122253, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 698, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 433.3125, "epoch": 0.0481040534030693, "grad_norm": 7.180733860222745, "kl": 0.5859375, "learning_rate": 9.943012920804385e-07, "loss": -0.0, "reward": 2.3300466537475586, "reward_std": 0.024814175441861153, "rewards/accuracy_reward": 0.680046558380127, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 699, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 445.03125, "epoch": 0.04817287179134265, "grad_norm": 0.8340629366105765, "kl": 0.64453125, "learning_rate": 9.942850062561866e-07, "loss": 0.0, "reward": 2.393749952316284, "reward_std": 0.1332113891839981, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 700, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 421.1875, "epoch": 0.04824169017961599, "grad_norm": 0.45317784429226743, "kl": 0.60546875, "learning_rate": 9.942686973279879e-07, "loss": -0.0, "reward": 2.1437501907348633, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 701, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 444.53125, "epoch": 0.04831050856788934, "grad_norm": 0.0, "kl": 0.625, "learning_rate": 9.942523652966047e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 702, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 469.921875, "epoch": 0.048379326956162684, "grad_norm": 1.5278449927709508, "kl": 0.59375, "learning_rate": 9.942360101628007e-07, "loss": 0.0, "reward": 2.210355043411255, "reward_std": 0.13271309435367584, "rewards/accuracy_reward": 0.5791050791740417, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 703, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 417.140625, "epoch": 0.04844814534443603, "grad_norm": 0.46341849905480503, "kl": 0.62890625, "learning_rate": 9.942196319273398e-07, "loss": -0.0, "reward": 1.9862287044525146, "reward_std": 0.03783169388771057, "rewards/accuracy_reward": 0.8143536448478699, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 704, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 453.921875, "epoch": 0.04851696373270938, "grad_norm": 0.8279879306885662, "kl": 0.61328125, "learning_rate": 9.942032305909882e-07, "loss": 0.0, "reward": 1.8250000476837158, "reward_std": 0.1306653916835785, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 705, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 464.1875, "epoch": 0.048585782120982725, "grad_norm": 0.52557490522288, "kl": 0.66796875, "learning_rate": 9.941868061545123e-07, "loss": -0.0, "reward": 2.2250001430511475, "reward_std": 0.08017838001251221, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.25, "step": 706, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 428.921875, "epoch": 0.048654600509256074, "grad_norm": 2.1667154198288348, "kl": 0.609375, "learning_rate": 9.941703586186794e-07, "loss": 0.0, "reward": 2.6624999046325684, "reward_std": 0.06943651288747787, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 707, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 411.140625, "epoch": 0.04872341889752942, "grad_norm": 0.0, "kl": 0.625, "learning_rate": 9.94153887984259e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 708, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 514.78125, "epoch": 0.048792237285802766, "grad_norm": 0.9176127422294611, "kl": 0.5546875, "learning_rate": 9.9413739425202e-07, "loss": -0.0, "reward": 2.030951499938965, "reward_std": 0.060178451240062714, "rewards/accuracy_reward": 0.8340765833854675, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 709, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 453.265625, "epoch": 0.048861055674076115, "grad_norm": 0.628920175536416, "kl": 0.609375, "learning_rate": 9.941208774227346e-07, "loss": -0.0, "reward": 1.845396876335144, "reward_std": 0.07687247544527054, "rewards/accuracy_reward": 0.6891468167304993, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 710, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 444.875, "epoch": 0.04892987406234946, "grad_norm": 0.6931026481853906, "kl": 0.6796875, "learning_rate": 9.94104337497174e-07, "loss": 0.0, "reward": 2.2718749046325684, "reward_std": 0.09722718596458435, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 711, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 402.625, "epoch": 0.048998692450622806, "grad_norm": 1.581821204514512, "kl": 0.62109375, "learning_rate": 9.940877744761113e-07, "loss": 0.0, "reward": 2.405855417251587, "reward_std": 0.17597809433937073, "rewards/accuracy_reward": 0.7277302742004395, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 712, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 454.484375, "epoch": 0.049067510838896156, "grad_norm": 0.4733135422379607, "kl": 0.609375, "learning_rate": 9.94071188360321e-07, "loss": -0.0, "reward": 2.6437501907348633, "reward_std": 0.07763238251209259, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 713, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 504.46875, "epoch": 0.0491363292271695, "grad_norm": 3.364653035333596, "kl": 0.58984375, "learning_rate": 9.940545791505783e-07, "loss": -0.0, "reward": 2.330533027648926, "reward_std": 0.05732296034693718, "rewards/accuracy_reward": 0.8336581587791443, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.296875, "step": 714, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 442.734375, "epoch": 0.04920514761544285, "grad_norm": 0.7190346852177686, "kl": 0.609375, "learning_rate": 9.940379468476597e-07, "loss": 0.0, "reward": 2.124751567840576, "reward_std": 0.011302271857857704, "rewards/accuracy_reward": 0.9278764724731445, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 715, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 479.34375, "epoch": 0.049273966003716196, "grad_norm": 0.0, "kl": 0.6484375, "learning_rate": 9.940212914523422e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 716, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 475.78125, "epoch": 0.04934278439198954, "grad_norm": 1.0075293928383564, "kl": 0.65625, "learning_rate": 9.940046129654047e-07, "loss": 0.0, "reward": 1.881250023841858, "reward_std": 0.19190602004528046, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 717, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 479.09375, "epoch": 0.04941160278026289, "grad_norm": 0.5744019905613003, "kl": 0.59375, "learning_rate": 9.939879113876264e-07, "loss": 0.0, "reward": 2.094925880432129, "reward_std": 0.06883338838815689, "rewards/accuracy_reward": 0.9136759042739868, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 718, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 451.25, "epoch": 0.04948042116853623, "grad_norm": 0.5121208413185065, "kl": 0.609375, "learning_rate": 9.939711867197885e-07, "loss": -0.0, "reward": 2.065624952316284, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 719, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 459.125, "epoch": 0.04954923955680958, "grad_norm": 1.0004700990779956, "kl": 0.58203125, "learning_rate": 9.939544389626722e-07, "loss": -0.0, "reward": 2.3375000953674316, "reward_std": 0.1660207211971283, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 720, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 465.375, "epoch": 0.04961805794508293, "grad_norm": 1.0527919862275181, "kl": 0.59375, "learning_rate": 9.939376681170608e-07, "loss": -0.0, "reward": 1.984697937965393, "reward_std": 0.048356667160987854, "rewards/accuracy_reward": 0.8096978664398193, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 721, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 510.109375, "epoch": 0.04968687633335627, "grad_norm": 0.9175368609233439, "kl": 0.55859375, "learning_rate": 9.93920874183738e-07, "loss": -0.0, "reward": 1.8156094551086426, "reward_std": 0.114982008934021, "rewards/accuracy_reward": 0.6843595504760742, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 722, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 494.265625, "epoch": 0.04975569472162962, "grad_norm": 0.9757928994792762, "kl": 0.62109375, "learning_rate": 9.939040571634885e-07, "loss": 0.0, "reward": 2.4671874046325684, "reward_std": 0.15276247262954712, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 723, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 473.90625, "epoch": 0.04982451310990297, "grad_norm": 0.40605671119687853, "kl": 0.5625, "learning_rate": 9.93887217057099e-07, "loss": 0.0, "reward": 2.03125, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 724, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 485.390625, "epoch": 0.04989333149817631, "grad_norm": 0.534538411651468, "kl": 0.57421875, "learning_rate": 9.93870353865356e-07, "loss": 0.0, "reward": 1.94205904006958, "reward_std": 0.06764605641365051, "rewards/accuracy_reward": 0.7858090400695801, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 725, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 507.40625, "epoch": 0.04996214988644966, "grad_norm": 0.0, "kl": 0.6015625, "learning_rate": 9.93853467589048e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 726, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 491.734375, "epoch": 0.050030968274723, "grad_norm": 0.5956456850396764, "kl": 0.5546875, "learning_rate": 9.938365582289644e-07, "loss": -0.0, "reward": 1.9026916027069092, "reward_std": 0.004836547188460827, "rewards/accuracy_reward": 0.7276915311813354, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 727, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 502.828125, "epoch": 0.05009978666299635, "grad_norm": 0.46342914176361005, "kl": 0.6328125, "learning_rate": 9.938196257858954e-07, "loss": 0.0, "reward": 2.621875047683716, "reward_std": 0.0289318785071373, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.421875, "step": 728, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 469.890625, "epoch": 0.0501686050512697, "grad_norm": 1.1942412028173925, "kl": 0.58984375, "learning_rate": 9.938026702606327e-07, "loss": 0.0, "reward": 2.601872205734253, "reward_std": 0.07241600751876831, "rewards/accuracy_reward": 0.9049972295761108, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 729, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 516.140625, "epoch": 0.050237423439543044, "grad_norm": 0.8017145929574151, "kl": 0.58984375, "learning_rate": 9.937856916539687e-07, "loss": -0.0, "reward": 1.9594979286193848, "reward_std": 0.11255812644958496, "rewards/accuracy_reward": 0.8032479286193848, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 730, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 514.71875, "epoch": 0.05030624182781639, "grad_norm": 0.9583493961291485, "kl": 0.59375, "learning_rate": 9.937686899666968e-07, "loss": 0.0, "reward": 2.254134178161621, "reward_std": 0.209119513630867, "rewards/accuracy_reward": 0.7525715827941895, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.3515625, "step": 731, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 495.296875, "epoch": 0.05037506021608974, "grad_norm": 0.6029519703346087, "kl": 0.5859375, "learning_rate": 9.93751665199612e-07, "loss": 0.0, "reward": 2.34375, "reward_std": 0.12246952950954437, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 732, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 529.84375, "epoch": 0.050443878604363085, "grad_norm": 0.574032837746648, "kl": 0.58984375, "learning_rate": 9.937346173535098e-07, "loss": 0.0, "reward": 2.4763784408569336, "reward_std": 0.05778389424085617, "rewards/accuracy_reward": 0.904503583908081, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 733, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 523.796875, "epoch": 0.050512696992636434, "grad_norm": 0.0, "kl": 0.60546875, "learning_rate": 9.937175464291876e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 734, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 506.96875, "epoch": 0.050581515380909776, "grad_norm": 0.7128927676682507, "kl": 0.59375, "learning_rate": 9.937004524274425e-07, "loss": 0.0, "reward": 1.901912808418274, "reward_std": 0.10334526747465134, "rewards/accuracy_reward": 0.7394126653671265, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 735, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 509.3125, "epoch": 0.050650333769183126, "grad_norm": 0.5526743787381906, "kl": 0.625, "learning_rate": 9.936833353490743e-07, "loss": 0.0, "reward": 2.1161530017852783, "reward_std": 0.006135357543826103, "rewards/accuracy_reward": 0.9161529541015625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 736, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 535.296875, "epoch": 0.050719152157456475, "grad_norm": 0.7489243562016707, "kl": 0.58984375, "learning_rate": 9.936661951948828e-07, "loss": -0.0, "reward": 1.805648684501648, "reward_std": 0.15791140496730804, "rewards/accuracy_reward": 0.6868986487388611, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 737, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 528.46875, "epoch": 0.05078797054572982, "grad_norm": 0.0, "kl": 0.60546875, "learning_rate": 9.936490319656688e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 738, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 485.03125, "epoch": 0.050856788934003166, "grad_norm": 0.6659826891965398, "kl": 0.62890625, "learning_rate": 9.93631845662235e-07, "loss": 0.0, "reward": 2.528125047683716, "reward_std": 0.11363068222999573, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 739, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 485.125, "epoch": 0.050925607322276516, "grad_norm": 0.5587184459207643, "kl": 0.59375, "learning_rate": 9.936146362853847e-07, "loss": 0.0, "reward": 2.512500047683716, "reward_std": 0.06943651288747787, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 740, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 490.671875, "epoch": 0.05099442571054986, "grad_norm": 0.4772521697427735, "kl": 0.59375, "learning_rate": 9.93597403835922e-07, "loss": -0.0, "reward": 2.487802028656006, "reward_std": 0.06926105916500092, "rewards/accuracy_reward": 0.8065521121025085, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 741, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 483.421875, "epoch": 0.05106324409882321, "grad_norm": 0.6129063227080362, "kl": 0.58203125, "learning_rate": 9.935801483146527e-07, "loss": 0.0, "reward": 2.3860936164855957, "reward_std": 0.0830213874578476, "rewards/accuracy_reward": 0.726718544960022, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 742, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 543.65625, "epoch": 0.05113206248709655, "grad_norm": 0.0, "kl": 0.58984375, "learning_rate": 9.935628697223831e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 743, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 473.1875, "epoch": 0.0512008808753699, "grad_norm": 1.078669928952522, "kl": 0.60546875, "learning_rate": 9.935455680599212e-07, "loss": 0.0, "reward": 2.463804244995117, "reward_std": 0.03667999058961868, "rewards/accuracy_reward": 0.7638042569160461, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 744, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 530.9375, "epoch": 0.05126969926364325, "grad_norm": 0.6073069640729485, "kl": 0.61328125, "learning_rate": 9.935282433280753e-07, "loss": 0.0, "reward": 2.081167221069336, "reward_std": 0.0821099579334259, "rewards/accuracy_reward": 0.8905420899391174, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 745, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 502.359375, "epoch": 0.05133851765191659, "grad_norm": 0.0, "kl": 0.59375, "learning_rate": 9.935108955276555e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 746, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 504.984375, "epoch": 0.05140733604018994, "grad_norm": 0.7593572780875466, "kl": 0.60546875, "learning_rate": 9.934935246594727e-07, "loss": 0.0, "reward": 1.9750001430511475, "reward_std": 0.08017838001251221, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 747, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 513.484375, "epoch": 0.05147615442846329, "grad_norm": 0.885007533175104, "kl": 0.5703125, "learning_rate": 9.934761307243384e-07, "loss": -0.0, "reward": 1.9290462732315063, "reward_std": 0.015294017270207405, "rewards/accuracy_reward": 0.7540462017059326, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 748, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 509.609375, "epoch": 0.05154497281673663, "grad_norm": 1.0433851497536866, "kl": 0.5625, "learning_rate": 9.93458713723066e-07, "loss": 0.0, "reward": 1.936495065689087, "reward_std": 0.11368320882320404, "rewards/accuracy_reward": 0.7677451372146606, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 749, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 501.734375, "epoch": 0.05161379120500998, "grad_norm": 1.3297351353418696, "kl": 0.61328125, "learning_rate": 9.934412736564696e-07, "loss": 0.0, "reward": 2.625, "reward_std": 0.1306653916835785, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 750, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 543.625, "epoch": 0.05168260959328332, "grad_norm": 0.8663053214154426, "kl": 0.59375, "learning_rate": 9.934238105253644e-07, "loss": 0.0, "reward": 1.8715821504592896, "reward_std": 0.23871177434921265, "rewards/accuracy_reward": 0.7278320789337158, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 751, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 554.796875, "epoch": 0.05175142798155667, "grad_norm": 0.700569337621427, "kl": 0.59375, "learning_rate": 9.934063243305666e-07, "loss": 0.0, "reward": 1.6906249523162842, "reward_std": 0.12437255680561066, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 752, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 538.75, "epoch": 0.05182024636983002, "grad_norm": 1.0230116874647588, "kl": 0.59375, "learning_rate": 9.933888150728936e-07, "loss": -0.0, "reward": 2.176356077194214, "reward_std": 0.19628570973873138, "rewards/accuracy_reward": 0.5544810891151428, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 753, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 548.140625, "epoch": 0.05188906475810336, "grad_norm": 0.0, "kl": 0.59765625, "learning_rate": 9.933712827531636e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 754, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 519.640625, "epoch": 0.05195788314637671, "grad_norm": 0.6735475269405018, "kl": 0.6484375, "learning_rate": 9.933537273721963e-07, "loss": -0.0, "reward": 1.8759596347808838, "reward_std": 0.08596108108758926, "rewards/accuracy_reward": 0.7197096943855286, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 755, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 563.828125, "epoch": 0.05202670153465006, "grad_norm": 0.45251940772578675, "kl": 0.59375, "learning_rate": 9.933361489308123e-07, "loss": 0.0, "reward": 1.975000023841858, "reward_std": 0.08017838001251221, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 756, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 504.65625, "epoch": 0.052095519922923404, "grad_norm": 1.6786201741494087, "kl": 0.671875, "learning_rate": 9.933185474298331e-07, "loss": 0.0, "reward": 2.265625, "reward_std": 0.0819648876786232, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.234375, "step": 757, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 504.375, "epoch": 0.05216433831119675, "grad_norm": 0.8800076563677183, "kl": 0.69140625, "learning_rate": 9.933009228700814e-07, "loss": -0.0, "reward": 2.044125556945801, "reward_std": 0.10311782360076904, "rewards/accuracy_reward": 0.8503754734992981, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 758, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 534.265625, "epoch": 0.052233156699470096, "grad_norm": 0.5708464489082936, "kl": 0.63671875, "learning_rate": 9.932832752523814e-07, "loss": -0.0, "reward": 2.2545173168182373, "reward_std": 0.005215955898165703, "rewards/accuracy_reward": 0.6670172810554504, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 759, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 530.875, "epoch": 0.052301975087743445, "grad_norm": 0.77785466143202, "kl": 0.6640625, "learning_rate": 9.932656045775575e-07, "loss": 0.0, "reward": 2.473437547683716, "reward_std": 0.09972946345806122, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3671875, "step": 760, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 539.5625, "epoch": 0.052370793476016794, "grad_norm": 1.3869326685289423, "kl": 0.65234375, "learning_rate": 9.932479108464362e-07, "loss": 0.0, "reward": 2.0726447105407715, "reward_std": 0.1651388257741928, "rewards/accuracy_reward": 0.6429572105407715, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.2734375, "step": 761, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 501.96875, "epoch": 0.05243961186429014, "grad_norm": 0.0, "kl": 0.58203125, "learning_rate": 9.93230194059844e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 762, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 544.765625, "epoch": 0.052508430252563486, "grad_norm": 0.6717752249920634, "kl": 0.60546875, "learning_rate": 9.932124542186094e-07, "loss": 0.0, "reward": 1.8305041790008545, "reward_std": 0.05654067173600197, "rewards/accuracy_reward": 0.6805040836334229, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 763, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 521.546875, "epoch": 0.052577248640836835, "grad_norm": 1.149179197743064, "kl": 0.58984375, "learning_rate": 9.931946913235613e-07, "loss": -0.0, "reward": 2.496080160140991, "reward_std": 0.13102120161056519, "rewards/accuracy_reward": 0.8179551362991333, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 764, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 555.546875, "epoch": 0.05264606702911018, "grad_norm": 0.8398329151046717, "kl": 0.609375, "learning_rate": 9.931769053755302e-07, "loss": -0.0, "reward": 2.2844629287719727, "reward_std": 0.15572002530097961, "rewards/accuracy_reward": 0.6407129764556885, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 765, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 510.796875, "epoch": 0.05271488541738353, "grad_norm": 0.0, "kl": 0.59375, "learning_rate": 9.931590963753473e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 766, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 511.640625, "epoch": 0.05278370380565687, "grad_norm": 0.8385409011373928, "kl": 0.65234375, "learning_rate": 9.93141264323845e-07, "loss": 0.0, "reward": 2.379469871520996, "reward_std": 0.15982580184936523, "rewards/accuracy_reward": 0.7232198715209961, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 767, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 514.421875, "epoch": 0.05285252219393022, "grad_norm": 5.304876932082813, "kl": 0.5859375, "learning_rate": 9.931234092218572e-07, "loss": 0.0, "reward": 2.4552345275878906, "reward_std": 0.1976258009672165, "rewards/accuracy_reward": 0.8005470037460327, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 768, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 508.859375, "epoch": 0.05292134058220357, "grad_norm": 0.8376373988295228, "kl": 0.57421875, "learning_rate": 9.93105531070218e-07, "loss": 0.0, "reward": 2.512500047683716, "reward_std": 0.06943651288747787, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 769, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 559.5625, "epoch": 0.05299015897047691, "grad_norm": 0.7145733471514143, "kl": 0.51953125, "learning_rate": 9.930876298697635e-07, "loss": -0.0, "reward": 2.0237176418304443, "reward_std": 0.07511932402849197, "rewards/accuracy_reward": 0.842467725276947, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 770, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 540.640625, "epoch": 0.05305897735875026, "grad_norm": 0.0, "kl": 0.5546875, "learning_rate": 9.9306970562133e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 771, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 506.734375, "epoch": 0.05312779574702361, "grad_norm": 0.5520757960728395, "kl": 0.6953125, "learning_rate": 9.930517583257555e-07, "loss": 0.0, "reward": 2.012500047683716, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 772, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 540.296875, "epoch": 0.05319661413529695, "grad_norm": 0.0, "kl": 0.61328125, "learning_rate": 9.93033787983879e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 773, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 519.0625, "epoch": 0.0532654325235703, "grad_norm": 0.0, "kl": 0.63671875, "learning_rate": 9.930157945965405e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 774, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 551.5, "epoch": 0.05333425091184364, "grad_norm": 0.7665078672918836, "kl": 0.59765625, "learning_rate": 9.929977781645809e-07, "loss": -0.0, "reward": 1.5572234392166138, "reward_std": 0.07137145102024078, "rewards/accuracy_reward": 0.45097342133522034, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 775, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 520.671875, "epoch": 0.05340306930011699, "grad_norm": 0.6404413784655716, "kl": 0.58203125, "learning_rate": 9.929797386888424e-07, "loss": 0.0, "reward": 2.0067696571350098, "reward_std": 0.007744221482425928, "rewards/accuracy_reward": 0.8067694902420044, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 776, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 525.546875, "epoch": 0.05347188768839034, "grad_norm": 0.4291845123162589, "kl": 0.6640625, "learning_rate": 9.92961676170168e-07, "loss": -0.0, "reward": 2.3062500953674316, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 777, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 517.34375, "epoch": 0.05354070607666368, "grad_norm": 2.2602967136192733, "kl": 0.57421875, "learning_rate": 9.929435906094023e-07, "loss": 0.0, "reward": 2.5966594219207764, "reward_std": 0.08946570754051208, "rewards/accuracy_reward": 0.8997844457626343, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 778, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 531.640625, "epoch": 0.05360952446493703, "grad_norm": 4.278255443713556, "kl": 0.58984375, "learning_rate": 9.929254820073905e-07, "loss": 0.0, "reward": 2.03125, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 779, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 536.390625, "epoch": 0.05367834285321038, "grad_norm": 0.6891134431915749, "kl": 0.55078125, "learning_rate": 9.92907350364979e-07, "loss": 0.0, "reward": 2.528125047683716, "reward_std": 0.11363068222999573, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 780, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 513.90625, "epoch": 0.053747161241483724, "grad_norm": 0.0, "kl": 0.62109375, "learning_rate": 9.928891956830153e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 781, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 542.234375, "epoch": 0.05381597962975707, "grad_norm": 1.3621082325176017, "kl": 0.6171875, "learning_rate": 9.92871017962348e-07, "loss": -0.0, "reward": 2.6237668991088867, "reward_std": 0.001825083396397531, "rewards/accuracy_reward": 0.9237669110298157, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 782, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 534.28125, "epoch": 0.053884798018030415, "grad_norm": 1.0235267587346069, "kl": 0.54296875, "learning_rate": 9.928528172038272e-07, "loss": -0.0, "reward": 2.2555339336395264, "reward_std": 0.06577550619840622, "rewards/accuracy_reward": 0.6055340766906738, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 783, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 489.984375, "epoch": 0.053953616406303764, "grad_norm": 0.7490482260089493, "kl": 0.58984375, "learning_rate": 9.92834593408303e-07, "loss": 0.0, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 784, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 540.28125, "epoch": 0.054022434794577114, "grad_norm": 1.490399741476375, "kl": 0.5390625, "learning_rate": 9.928163465766275e-07, "loss": 0.0, "reward": 2.4375, "reward_std": 0.186244398355484, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 785, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 495.015625, "epoch": 0.054091253182850456, "grad_norm": 1.153932612243145, "kl": 0.6953125, "learning_rate": 9.927980767096534e-07, "loss": 0.0, "reward": 2.46875, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 786, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 520.359375, "epoch": 0.054160071571123805, "grad_norm": 0.0, "kl": 0.6171875, "learning_rate": 9.927797838082351e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 787, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 501.015625, "epoch": 0.054228889959397154, "grad_norm": 0.4719370687363743, "kl": 0.61328125, "learning_rate": 9.927614678732272e-07, "loss": -0.0, "reward": 1.9727070331573486, "reward_std": 0.012191811576485634, "rewards/accuracy_reward": 0.7977069616317749, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 788, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 494.3125, "epoch": 0.0542977083476705, "grad_norm": 0.6457495170436802, "kl": 0.6875, "learning_rate": 9.927431289054863e-07, "loss": 0.0, "reward": 2.1812500953674316, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 789, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 494.359375, "epoch": 0.054366526735943846, "grad_norm": 0.7282955093368674, "kl": 0.67578125, "learning_rate": 9.92724766905869e-07, "loss": 0.0, "reward": 2.543750047683716, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 790, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 500.765625, "epoch": 0.05443534512421719, "grad_norm": 0.0, "kl": 0.68359375, "learning_rate": 9.92706381875234e-07, "loss": 0.0, "reward": 2.4250001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 791, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 499.6875, "epoch": 0.05450416351249054, "grad_norm": 1.529845876482441, "kl": 0.609375, "learning_rate": 9.926879738144407e-07, "loss": 0.0, "reward": 1.907853364944458, "reward_std": 0.1901736855506897, "rewards/accuracy_reward": 0.7484782934188843, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 792, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 484.921875, "epoch": 0.05457298190076389, "grad_norm": 1.2415174899544743, "kl": 0.578125, "learning_rate": 9.926695427243492e-07, "loss": 0.0, "reward": 1.8987185955047607, "reward_std": 0.15885670483112335, "rewards/accuracy_reward": 0.7518435716629028, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 793, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 500.578125, "epoch": 0.05464180028903723, "grad_norm": 0.5194796153640521, "kl": 0.6640625, "learning_rate": 9.926510886058213e-07, "loss": 0.0, "reward": 2.2906250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 794, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 494.796875, "epoch": 0.05471061867731058, "grad_norm": 0.0, "kl": 0.6015625, "learning_rate": 9.926326114597196e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 795, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 481.140625, "epoch": 0.05477943706558393, "grad_norm": 1.4351896736037257, "kl": 0.6015625, "learning_rate": 9.926141112869075e-07, "loss": 0.0, "reward": 2.5656251907348633, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 796, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 495.828125, "epoch": 0.05484825545385727, "grad_norm": 0.72505329646668, "kl": 0.67578125, "learning_rate": 9.9259558808825e-07, "loss": 0.0, "reward": 1.9699231386184692, "reward_std": 0.017120089381933212, "rewards/accuracy_reward": 0.4324231445789337, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 797, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 512.578125, "epoch": 0.05491707384213062, "grad_norm": 0.9807645003739199, "kl": 0.5859375, "learning_rate": 9.925770418646126e-07, "loss": 0.0, "reward": 1.7393653392791748, "reward_std": 0.22471287846565247, "rewards/accuracy_reward": 0.6612403392791748, "rewards/format_reward": 0.9375, "rewards/transform_reward": 0.0, "step": 798, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 481.234375, "epoch": 0.05498589223040396, "grad_norm": 0.6302523085115247, "kl": 0.5859375, "learning_rate": 9.925584726168627e-07, "loss": -0.0, "reward": 2.07480788230896, "reward_std": 0.032512787729501724, "rewards/accuracy_reward": 0.8748078346252441, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 799, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 470.421875, "epoch": 0.05505471061867731, "grad_norm": 0.961434844829736, "kl": 0.68359375, "learning_rate": 9.925398803458678e-07, "loss": 0.0, "reward": 2.481250047683716, "reward_std": 0.07763238251209259, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 800, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 483.125, "epoch": 0.05512352900695066, "grad_norm": 0.6978565113683662, "kl": 0.6875, "learning_rate": 9.92521265052497e-07, "loss": -0.0, "reward": 2.4125001430511475, "reward_std": 0.08017838001251221, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 801, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 514.453125, "epoch": 0.055192347395224, "grad_norm": 0.7160069481281341, "kl": 0.53125, "learning_rate": 9.925026267376207e-07, "loss": 0.0, "reward": 2.17415452003479, "reward_std": 0.0864952951669693, "rewards/accuracy_reward": 0.5366545915603638, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 802, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 483.671875, "epoch": 0.05526116578349735, "grad_norm": 0.0, "kl": 0.62890625, "learning_rate": 9.924839654021098e-07, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 803, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 482.171875, "epoch": 0.0553299841717707, "grad_norm": 0.4286919575891598, "kl": 0.59375, "learning_rate": 9.924652810468367e-07, "loss": -0.0, "reward": 1.985488772392273, "reward_std": 0.015174410305917263, "rewards/accuracy_reward": 0.810488760471344, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 804, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 483.625, "epoch": 0.05539880256004404, "grad_norm": 1.4363112307805868, "kl": 0.66796875, "learning_rate": 9.92446573672675e-07, "loss": 0.0, "reward": 2.4848546981811523, "reward_std": 0.07794313877820969, "rewards/accuracy_reward": 0.9051671624183655, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3828125, "step": 805, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 474.484375, "epoch": 0.05546762094831739, "grad_norm": 1.0410926252319563, "kl": 0.609375, "learning_rate": 9.924278432804983e-07, "loss": -0.0, "reward": 1.8941075801849365, "reward_std": 0.1812698245048523, "rewards/accuracy_reward": 0.7347323894500732, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 806, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 506.71875, "epoch": 0.055536439336590734, "grad_norm": 1.6270290462965127, "kl": 0.609375, "learning_rate": 9.92409089871183e-07, "loss": -0.0, "reward": 2.344177484512329, "reward_std": 0.01113303005695343, "rewards/accuracy_reward": 0.7691776156425476, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 807, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 506.328125, "epoch": 0.055605257724864084, "grad_norm": 1.622069569391701, "kl": 0.65234375, "learning_rate": 9.923903134456053e-07, "loss": -0.0, "reward": 2.0161972045898438, "reward_std": 0.07260177284479141, "rewards/accuracy_reward": 0.8349469304084778, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 808, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 497.046875, "epoch": 0.05567407611313743, "grad_norm": 0.9900483254744848, "kl": 0.6640625, "learning_rate": 9.923715140046428e-07, "loss": 0.0, "reward": 2.0541281700134277, "reward_std": 0.08095496147871017, "rewards/accuracy_reward": 0.8635031580924988, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 809, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 477.640625, "epoch": 0.055742894501410775, "grad_norm": 0.801127352578369, "kl": 0.6484375, "learning_rate": 9.923526915491745e-07, "loss": -0.0, "reward": 2.328125, "reward_std": 0.21361474692821503, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 810, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 451.890625, "epoch": 0.055811712889684124, "grad_norm": 0.5193696680130832, "kl": 0.6640625, "learning_rate": 9.9233384608008e-07, "loss": -0.0, "reward": 1.871168613433838, "reward_std": 0.013115942478179932, "rewards/accuracy_reward": 0.7211686372756958, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 811, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 490.78125, "epoch": 0.055880531277957474, "grad_norm": 0.9415781700119208, "kl": 0.62890625, "learning_rate": 9.9231497759824e-07, "loss": -0.0, "reward": 2.4044909477233887, "reward_std": 0.0164642333984375, "rewards/accuracy_reward": 0.7294908165931702, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 812, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 482.890625, "epoch": 0.055949349666230816, "grad_norm": 0.7078804346377744, "kl": 0.54296875, "learning_rate": 9.922960861045366e-07, "loss": 0.0, "reward": 1.985068917274475, "reward_std": 0.046699803322553635, "rewards/accuracy_reward": 0.8100689053535461, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 813, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 456.828125, "epoch": 0.056018168054504165, "grad_norm": 1.2961331153687412, "kl": 0.66015625, "learning_rate": 9.92277171599853e-07, "loss": 0.0, "reward": 2.5336618423461914, "reward_std": 0.057328201830387115, "rewards/accuracy_reward": 0.8586617708206177, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 814, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 456.015625, "epoch": 0.05608698644277751, "grad_norm": 0.3982980564894892, "kl": 0.63671875, "learning_rate": 9.922582340850731e-07, "loss": 0.0, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 815, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 481.5, "epoch": 0.05615580483105086, "grad_norm": 0.0, "kl": 0.66015625, "learning_rate": 9.922392735610823e-07, "loss": 0.0, "reward": 2.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 816, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 467.90625, "epoch": 0.056224623219324206, "grad_norm": 0.0, "kl": 0.71484375, "learning_rate": 9.922202900287666e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 817, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 476.984375, "epoch": 0.05629344160759755, "grad_norm": 0.0, "kl": 0.6640625, "learning_rate": 9.922012834890136e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 818, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 457.90625, "epoch": 0.0563622599958709, "grad_norm": 1.258496372702601, "kl": 0.640625, "learning_rate": 9.921822539427114e-07, "loss": 0.0, "reward": 2.106250047683716, "reward_std": 0.13321137428283691, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 819, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 450.1875, "epoch": 0.05643107838414425, "grad_norm": 0.0, "kl": 0.671875, "learning_rate": 9.921632013907497e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 820, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 481.453125, "epoch": 0.05649989677241759, "grad_norm": 0.673777887744338, "kl": 0.67578125, "learning_rate": 9.92144125834019e-07, "loss": 0.0, "reward": 2.580007314682007, "reward_std": 0.07400016486644745, "rewards/accuracy_reward": 0.8862572908401489, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 821, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 459.453125, "epoch": 0.05656871516069094, "grad_norm": 1.427056927425111, "kl": 0.60546875, "learning_rate": 9.921250272734108e-07, "loss": 0.0, "reward": 2.1813197135925293, "reward_std": 0.10575653612613678, "rewards/accuracy_reward": 0.575069785118103, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 822, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 480.234375, "epoch": 0.05663753354896428, "grad_norm": 1.9117976688426734, "kl": 0.62890625, "learning_rate": 9.92105905709818e-07, "loss": -0.0, "reward": 1.834474802017212, "reward_std": 0.047086264938116074, "rewards/accuracy_reward": 0.6844747066497803, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 823, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 464.671875, "epoch": 0.05670635193723763, "grad_norm": 0.5172512127040291, "kl": 0.59375, "learning_rate": 9.920867611441344e-07, "loss": 0.0, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 824, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 472.390625, "epoch": 0.05677517032551098, "grad_norm": 1.4729699550872757, "kl": 0.62890625, "learning_rate": 9.920675935772547e-07, "loss": 0.0, "reward": 2.1139724254608154, "reward_std": 0.10369522869586945, "rewards/accuracy_reward": 0.9202224016189575, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 825, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 457.34375, "epoch": 0.05684398871378432, "grad_norm": 0.8206843701197649, "kl": 0.609375, "learning_rate": 9.92048403010075e-07, "loss": -0.0, "reward": 2.092804431915283, "reward_std": 0.11145544052124023, "rewards/accuracy_reward": 0.9053043723106384, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 826, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 475.0, "epoch": 0.05691280710205767, "grad_norm": 7.659757409258243, "kl": 0.6171875, "learning_rate": 9.92029189443492e-07, "loss": -0.0, "reward": 2.5035552978515625, "reward_std": 0.15352028608322144, "rewards/accuracy_reward": 0.8254302144050598, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 827, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 474.515625, "epoch": 0.05698162549033102, "grad_norm": 0.0, "kl": 0.6328125, "learning_rate": 9.92009952878404e-07, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 828, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 466.484375, "epoch": 0.05705044387860436, "grad_norm": 0.850925967611214, "kl": 0.65625, "learning_rate": 9.919906933157104e-07, "loss": -0.0, "reward": 2.1437501907348633, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 829, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 466.140625, "epoch": 0.05711926226687771, "grad_norm": 1.245324149990873, "kl": 0.60546875, "learning_rate": 9.919714107563109e-07, "loss": -0.0, "reward": 2.358517646789551, "reward_std": 0.07207437604665756, "rewards/accuracy_reward": 0.702267587184906, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 830, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 466.21875, "epoch": 0.057188080655151054, "grad_norm": 0.49431839513429193, "kl": 0.7265625, "learning_rate": 9.919521052011072e-07, "loss": 0.0, "reward": 2.692187547683716, "reward_std": 0.022097086533904076, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 831, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 479.703125, "epoch": 0.0572568990434244, "grad_norm": 0.6739625459620189, "kl": 0.6484375, "learning_rate": 9.919327766510017e-07, "loss": -0.0, "reward": 2.632443428039551, "reward_std": 0.008956074714660645, "rewards/accuracy_reward": 0.9324433207511902, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 832, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 533.296875, "epoch": 0.05732571743169775, "grad_norm": 0.6987644529884481, "kl": 0.65625, "learning_rate": 9.919134251068978e-07, "loss": 0.0, "reward": 2.477809429168701, "reward_std": 0.005333346780389547, "rewards/accuracy_reward": 0.8403095602989197, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 833, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 461.3125, "epoch": 0.057394535819971094, "grad_norm": 0.8058058266055779, "kl": 0.640625, "learning_rate": 9.918940505696997e-07, "loss": 0.0, "reward": 1.9553303718566895, "reward_std": 0.09916700422763824, "rewards/accuracy_reward": 0.7865803241729736, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 834, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 496.25, "epoch": 0.057463354208244444, "grad_norm": 0.5776101484234749, "kl": 0.6796875, "learning_rate": 9.918746530403133e-07, "loss": -0.0, "reward": 2.225714921951294, "reward_std": 0.01298416219651699, "rewards/accuracy_reward": 0.5757149457931519, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 835, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 482.25, "epoch": 0.05753217259651779, "grad_norm": 1.1100485728667693, "kl": 0.6484375, "learning_rate": 9.918552325196454e-07, "loss": 0.0, "reward": 2.123046875, "reward_std": 0.00748702697455883, "rewards/accuracy_reward": 0.923046886920929, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 836, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 470.65625, "epoch": 0.057600990984791135, "grad_norm": 1.7358702495190932, "kl": 0.6015625, "learning_rate": 9.918357890086036e-07, "loss": -0.0, "reward": 2.1531248092651367, "reward_std": 0.11363068222999573, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 837, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 476.859375, "epoch": 0.057669809373064485, "grad_norm": 1.7272815093215523, "kl": 0.64453125, "learning_rate": 9.918163225080966e-07, "loss": -0.0, "reward": 1.6672039031982422, "reward_std": 0.21785052120685577, "rewards/accuracy_reward": 0.5453289151191711, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 838, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 499.171875, "epoch": 0.05773862776133783, "grad_norm": 0.0, "kl": 0.6171875, "learning_rate": 9.917968330190348e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 839, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 495.078125, "epoch": 0.057807446149611176, "grad_norm": 0.8663212166986979, "kl": 0.640625, "learning_rate": 9.917773205423286e-07, "loss": 0.0, "reward": 1.9102141857147217, "reward_std": 0.012192726135253906, "rewards/accuracy_reward": 0.735214114189148, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 840, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 477.34375, "epoch": 0.057876264537884525, "grad_norm": 0.7697115717158575, "kl": 0.62109375, "learning_rate": 9.917577850788904e-07, "loss": 0.0, "reward": 1.9187500476837158, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 841, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 468.109375, "epoch": 0.05794508292615787, "grad_norm": 0.4154798728113613, "kl": 0.64453125, "learning_rate": 9.91738226629633e-07, "loss": 0.0, "reward": 1.7125000953674316, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 842, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 513.03125, "epoch": 0.05801390131443122, "grad_norm": 0.8497134479250215, "kl": 0.61328125, "learning_rate": 9.917186451954708e-07, "loss": 0.0, "reward": 2.028931140899658, "reward_std": 0.14334794878959656, "rewards/accuracy_reward": 0.8508062362670898, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 843, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 474.8125, "epoch": 0.058082719702704566, "grad_norm": 0.0, "kl": 0.69140625, "learning_rate": 9.916990407773195e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 844, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 503.71875, "epoch": 0.05815153809097791, "grad_norm": 0.9413815466984229, "kl": 0.6875, "learning_rate": 9.916794133760946e-07, "loss": -0.0, "reward": 2.3657727241516113, "reward_std": 0.08113209903240204, "rewards/accuracy_reward": 0.7095228433609009, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 845, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 506.171875, "epoch": 0.05822035647925126, "grad_norm": 0.4063047252400188, "kl": 0.59765625, "learning_rate": 9.916597629927143e-07, "loss": -0.0, "reward": 2.0132651329040527, "reward_std": 0.001999512780457735, "rewards/accuracy_reward": 0.8132649064064026, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 846, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 466.703125, "epoch": 0.0582891748675246, "grad_norm": 1.397387290717169, "kl": 0.71484375, "learning_rate": 9.916400896280965e-07, "loss": -0.0, "reward": 1.8705806732177734, "reward_std": 0.23341825604438782, "rewards/accuracy_reward": 0.7143307328224182, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 847, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 479.875, "epoch": 0.05835799325579795, "grad_norm": 0.9968937999302574, "kl": 0.59375, "learning_rate": 9.916203932831616e-07, "loss": -0.0, "reward": 1.8608076572418213, "reward_std": 0.06384649872779846, "rewards/accuracy_reward": 0.6889326572418213, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 848, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 490.75, "epoch": 0.0584268116440713, "grad_norm": 0.5096877954161557, "kl": 0.6171875, "learning_rate": 9.916006739588292e-07, "loss": 0.0, "reward": 2.1812500953674316, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 849, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 489.84375, "epoch": 0.05849563003234464, "grad_norm": 1.7817528192671679, "kl": 0.57421875, "learning_rate": 9.915809316560216e-07, "loss": -0.0, "reward": 2.499016046524048, "reward_std": 0.08528254181146622, "rewards/accuracy_reward": 0.8083910346031189, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 850, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 487.875, "epoch": 0.05856444842061799, "grad_norm": 1.1620906778721853, "kl": 0.609375, "learning_rate": 9.915611663756618e-07, "loss": -0.0, "reward": 2.512500047683716, "reward_std": 0.1496148705482483, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 851, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 507.265625, "epoch": 0.05863326680889134, "grad_norm": 0.9220920664806775, "kl": 0.56640625, "learning_rate": 9.91541378118673e-07, "loss": -0.0, "reward": 2.5818207263946533, "reward_std": 0.09211525321006775, "rewards/accuracy_reward": 0.8911956548690796, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 852, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 534.984375, "epoch": 0.05870208519716468, "grad_norm": 0.7804662577570799, "kl": 0.66796875, "learning_rate": 9.915215668859808e-07, "loss": 0.0, "reward": 2.205552101135254, "reward_std": 0.16000692546367645, "rewards/accuracy_reward": 0.5743018388748169, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 853, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 501.046875, "epoch": 0.05877090358543803, "grad_norm": 4.321140730205623, "kl": 0.578125, "learning_rate": 9.91501732678511e-07, "loss": -0.0, "reward": 2.541599750518799, "reward_std": 0.1475170999765396, "rewards/accuracy_reward": 0.8697248101234436, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 854, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 514.015625, "epoch": 0.05883972197371137, "grad_norm": 1.6090615348172912, "kl": 0.5546875, "learning_rate": 9.914818754971905e-07, "loss": -0.0, "reward": 2.385650396347046, "reward_std": 0.01308409497141838, "rewards/accuracy_reward": 0.7106505036354065, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 855, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 525.328125, "epoch": 0.05890854036198472, "grad_norm": 0.0, "kl": 0.60546875, "learning_rate": 9.914619953429478e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 856, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 534.640625, "epoch": 0.05897735875025807, "grad_norm": 0.7985328157645258, "kl": 0.59765625, "learning_rate": 9.914420922167119e-07, "loss": 0.0, "reward": 2.0962369441986084, "reward_std": 0.06922326236963272, "rewards/accuracy_reward": 0.9149869680404663, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 857, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 506.859375, "epoch": 0.059046177138531414, "grad_norm": 1.0666201602066534, "kl": 0.56640625, "learning_rate": 9.91422166119413e-07, "loss": 0.0, "reward": 1.9562500715255737, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 858, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 561.375, "epoch": 0.05911499552680476, "grad_norm": 1.2459044465873998, "kl": 0.6640625, "learning_rate": 9.91402217051983e-07, "loss": -0.0, "reward": 2.173535108566284, "reward_std": 0.10256610810756683, "rewards/accuracy_reward": 0.5797851085662842, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.5, "step": 859, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 502.0625, "epoch": 0.05918381391507811, "grad_norm": 0.0, "kl": 0.62109375, "learning_rate": 9.91382245015354e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 860, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 526.015625, "epoch": 0.059252632303351455, "grad_norm": 0.7450000932994488, "kl": 0.69921875, "learning_rate": 9.913622500104594e-07, "loss": 0.0, "reward": 2.1781249046325684, "reward_std": 0.09836838394403458, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.015625, "step": 861, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 505.609375, "epoch": 0.059321450691624804, "grad_norm": 0.0, "kl": 0.703125, "learning_rate": 9.91342232038234e-07, "loss": 0.0, "reward": 2.524074077606201, "reward_std": 0.0, "rewards/accuracy_reward": 0.8490740656852722, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 862, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 514.0625, "epoch": 0.059390269079898146, "grad_norm": 0.7194319638842772, "kl": 0.5859375, "learning_rate": 9.913221910996137e-07, "loss": -0.0, "reward": 2.5875000953674316, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 863, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 531.671875, "epoch": 0.059459087468171495, "grad_norm": 1.0011915939489606, "kl": 0.69921875, "learning_rate": 9.913021271955347e-07, "loss": 0.0, "reward": 2.4749999046325684, "reward_std": 0.17550253868103027, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 864, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 500.078125, "epoch": 0.059527905856444845, "grad_norm": 1.3205322718436454, "kl": 0.5859375, "learning_rate": 9.912820403269352e-07, "loss": -0.0, "reward": 1.9375, "reward_std": 0.22470125555992126, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 865, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 513.140625, "epoch": 0.05959672424471819, "grad_norm": 0.46278237043083575, "kl": 0.6015625, "learning_rate": 9.912619304947542e-07, "loss": 0.0, "reward": 1.915624976158142, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 866, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 515.828125, "epoch": 0.059665542632991536, "grad_norm": 0.6439100682610643, "kl": 0.7109375, "learning_rate": 9.912417976999317e-07, "loss": -0.0, "reward": 2.5656251907348633, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 867, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 532.640625, "epoch": 0.059734361021264885, "grad_norm": 1.5399527291675945, "kl": 0.58203125, "learning_rate": 9.912216419434083e-07, "loss": -0.0, "reward": 1.7899250984191895, "reward_std": 0.056767258793115616, "rewards/accuracy_reward": 0.6336750388145447, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 868, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 521.4375, "epoch": 0.05980317940953823, "grad_norm": 0.0, "kl": 0.703125, "learning_rate": 9.912014632261265e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 869, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 528.453125, "epoch": 0.05987199779781158, "grad_norm": 1.607358658081809, "kl": 0.59375, "learning_rate": 9.911812615490294e-07, "loss": -0.0, "reward": 2.3390626907348633, "reward_std": 0.1912599802017212, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 870, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 498.546875, "epoch": 0.05994081618608492, "grad_norm": 0.7789706257590141, "kl": 0.60546875, "learning_rate": 9.911610369130611e-07, "loss": 0.0, "reward": 2.5046799182891846, "reward_std": 0.14253027737140656, "rewards/accuracy_reward": 0.8296799063682556, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 871, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 588.40625, "epoch": 0.06000963457435827, "grad_norm": 4.118948015454186, "kl": 0.65234375, "learning_rate": 9.911407893191673e-07, "loss": 0.0, "reward": 1.8127121925354004, "reward_std": 0.1558363437652588, "rewards/accuracy_reward": 0.6939622163772583, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.0, "step": 872, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 528.578125, "epoch": 0.06007845296263162, "grad_norm": 0.9837253394251737, "kl": 0.6015625, "learning_rate": 9.911205187682943e-07, "loss": 0.0, "reward": 1.881250023841858, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 873, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 500.03125, "epoch": 0.06014727135090496, "grad_norm": 1.1627725979921653, "kl": 0.703125, "learning_rate": 9.911002252613893e-07, "loss": -0.0, "reward": 2.149162769317627, "reward_std": 0.12353882193565369, "rewards/accuracy_reward": 0.627287745475769, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 874, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 519.671875, "epoch": 0.06021608973917831, "grad_norm": 0.6589101166647992, "kl": 0.7109375, "learning_rate": 9.910799087994012e-07, "loss": 0.0, "reward": 2.3774614334106445, "reward_std": 0.08233268558979034, "rewards/accuracy_reward": 0.7712115049362183, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 875, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 484.921875, "epoch": 0.06028490812745166, "grad_norm": 0.5608872970343663, "kl": 0.59375, "learning_rate": 9.910595693832796e-07, "loss": -0.0, "reward": 2.4852418899536133, "reward_std": 0.00733196223154664, "rewards/accuracy_reward": 0.8102417588233948, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 876, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 547.09375, "epoch": 0.060353726515725, "grad_norm": 0.8397488874703729, "kl": 0.58203125, "learning_rate": 9.91039207013975e-07, "loss": 0.0, "reward": 2.4691295623779297, "reward_std": 0.14414316415786743, "rewards/accuracy_reward": 0.809754490852356, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.5, "step": 877, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 495.734375, "epoch": 0.06042254490399835, "grad_norm": 0.0, "kl": 0.625, "learning_rate": 9.910188216924394e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 878, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 489.171875, "epoch": 0.06049136329227169, "grad_norm": 0.6002554984528623, "kl": 0.5625, "learning_rate": 9.909984134196256e-07, "loss": 0.0, "reward": 2.3625001907348633, "reward_std": 0.06943651288747787, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 879, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 473.421875, "epoch": 0.06056018168054504, "grad_norm": 1.0119958398105615, "kl": 0.5859375, "learning_rate": 9.909779821964874e-07, "loss": 0.0, "reward": 1.9442262649536133, "reward_std": 0.05971395596861839, "rewards/accuracy_reward": 0.7723512649536133, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 880, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 484.59375, "epoch": 0.06062900006881839, "grad_norm": 0.7644573898575978, "kl": 0.5546875, "learning_rate": 9.9095752802398e-07, "loss": -0.0, "reward": 2.5876238346099854, "reward_std": 0.09207373112440109, "rewards/accuracy_reward": 0.9001235961914062, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 881, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 475.296875, "epoch": 0.06069781845709173, "grad_norm": 0.6318830267390391, "kl": 0.56640625, "learning_rate": 9.909370509030591e-07, "loss": -0.0, "reward": 2.453125, "reward_std": 0.11363068222999573, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 882, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 520.78125, "epoch": 0.06076663684536508, "grad_norm": 0.96583572145232, "kl": 0.61328125, "learning_rate": 9.909165508346824e-07, "loss": -0.0, "reward": 2.3699326515197754, "reward_std": 0.2607654333114624, "rewards/accuracy_reward": 0.7293076515197754, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.5, "step": 883, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 527.015625, "epoch": 0.06083545523363843, "grad_norm": 0.7005272788539291, "kl": 0.59375, "learning_rate": 9.908960278198076e-07, "loss": 0.0, "reward": 1.9812322854995728, "reward_std": 0.09956598281860352, "rewards/accuracy_reward": 0.8187322020530701, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 884, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 486.84375, "epoch": 0.060904273621911774, "grad_norm": 0.6505964966789426, "kl": 0.5703125, "learning_rate": 9.908754818593943e-07, "loss": 0.0, "reward": 2.4156250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 885, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 533.921875, "epoch": 0.06097309201018512, "grad_norm": 1.4773766734861342, "kl": 0.546875, "learning_rate": 9.908549129544029e-07, "loss": 0.0, "reward": 1.8861799240112305, "reward_std": 0.10941977798938751, "rewards/accuracy_reward": 0.7236799001693726, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 886, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 503.046875, "epoch": 0.061041910398458465, "grad_norm": 0.4956155852592207, "kl": 0.5859375, "learning_rate": 9.908343211057946e-07, "loss": 0.0, "reward": 2.03125, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 887, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 547.21875, "epoch": 0.061110728786731815, "grad_norm": 0.4293023681465424, "kl": 0.65234375, "learning_rate": 9.908137063145318e-07, "loss": 0.0, "reward": 1.8812501430511475, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 888, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 497.0625, "epoch": 0.061179547175005164, "grad_norm": 0.0, "kl": 0.5546875, "learning_rate": 9.907930685815787e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 889, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 498.125, "epoch": 0.061248365563278506, "grad_norm": 0.0, "kl": 0.5546875, "learning_rate": 9.907724079078993e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 890, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 504.53125, "epoch": 0.061317183951551855, "grad_norm": 0.0, "kl": 0.55859375, "learning_rate": 9.907517242944597e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 891, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 556.609375, "epoch": 0.061386002339825205, "grad_norm": 1.2045220106209051, "kl": 0.64453125, "learning_rate": 9.907310177422262e-07, "loss": 0.0, "reward": 2.076951026916504, "reward_std": 0.05099352449178696, "rewards/accuracy_reward": 0.8691383600234985, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0078125, "step": 892, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 512.828125, "epoch": 0.06145482072809855, "grad_norm": 1.316114067913076, "kl": 0.58203125, "learning_rate": 9.907102882521676e-07, "loss": -0.0, "reward": 1.668143630027771, "reward_std": 0.15628772974014282, "rewards/accuracy_reward": 0.5775186419487, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 893, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 567.625, "epoch": 0.061523639116371896, "grad_norm": 0.36572483147487816, "kl": 0.64453125, "learning_rate": 9.90689535825252e-07, "loss": 0.0, "reward": 1.9921875, "reward_std": 0.0521576851606369, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.2578125, "step": 894, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 526.921875, "epoch": 0.06159245750464524, "grad_norm": 0.0, "kl": 0.5703125, "learning_rate": 9.906687604624498e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 895, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 493.640625, "epoch": 0.06166127589291859, "grad_norm": 0.573097391381081, "kl": 0.6171875, "learning_rate": 9.906479621647318e-07, "loss": -0.0, "reward": 1.9375, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 896, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 507.625, "epoch": 0.06173009428119194, "grad_norm": 0.0, "kl": 0.58203125, "learning_rate": 9.906271409330704e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 897, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 506.703125, "epoch": 0.06179891266946528, "grad_norm": 0.44195896444274124, "kl": 0.6015625, "learning_rate": 9.906062967684387e-07, "loss": -0.0, "reward": 2.095925807952881, "reward_std": 0.06909792125225067, "rewards/accuracy_reward": 0.9146756529808044, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 898, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 489.640625, "epoch": 0.06186773105773863, "grad_norm": 0.5119651252688006, "kl": 0.58203125, "learning_rate": 9.905854296718113e-07, "loss": 0.0, "reward": 2.0075740814208984, "reward_std": 0.03381015732884407, "rewards/accuracy_reward": 0.8325741291046143, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 899, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 482.15625, "epoch": 0.06193654944601198, "grad_norm": 1.9197643331119687, "kl": 0.640625, "learning_rate": 9.905645396441632e-07, "loss": 0.0, "reward": 1.9750001430511475, "reward_std": 0.08017837256193161, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 900, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 510.140625, "epoch": 0.06200536783428532, "grad_norm": 0.45563991568014506, "kl": 0.53125, "learning_rate": 9.90543626686471e-07, "loss": -0.0, "reward": 2.472123146057129, "reward_std": 0.0037839387077838182, "rewards/accuracy_reward": 0.7971231937408447, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 901, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 479.53125, "epoch": 0.06207418622255867, "grad_norm": 0.44975245112836676, "kl": 0.60546875, "learning_rate": 9.905226907997122e-07, "loss": -0.0, "reward": 1.9205671548843384, "reward_std": 0.015537344850599766, "rewards/accuracy_reward": 0.7611920833587646, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 902, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 472.0, "epoch": 0.06214300461083201, "grad_norm": 0.444991863209192, "kl": 0.6171875, "learning_rate": 9.905017319848654e-07, "loss": -0.0, "reward": 1.9940803050994873, "reward_std": 0.007883414626121521, "rewards/accuracy_reward": 0.8190802335739136, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 903, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 531.75, "epoch": 0.06221182299910536, "grad_norm": 0.0, "kl": 0.62890625, "learning_rate": 9.904807502429101e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 904, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 465.53125, "epoch": 0.06228064138737871, "grad_norm": 0.5041185694724729, "kl": 0.5859375, "learning_rate": 9.904597455748273e-07, "loss": -0.0, "reward": 2.122070074081421, "reward_std": 0.0049204519018530846, "rewards/accuracy_reward": 0.9220700263977051, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 905, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 490.03125, "epoch": 0.06234945977565205, "grad_norm": 0.5768129554700916, "kl": 0.56640625, "learning_rate": 9.904387179815988e-07, "loss": 0.0, "reward": 1.9718749523162842, "reward_std": 0.12182654440402985, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 906, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 504.546875, "epoch": 0.0624182781639254, "grad_norm": 0.6397149732294399, "kl": 0.60546875, "learning_rate": 9.904176674642072e-07, "loss": -0.0, "reward": 1.8314728736877441, "reward_std": 0.13773676753044128, "rewards/accuracy_reward": 0.6939728856086731, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 907, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 489.046875, "epoch": 0.06248709655219875, "grad_norm": 1.1412397703968569, "kl": 0.55078125, "learning_rate": 9.903965940236367e-07, "loss": 0.0, "reward": 2.4182214736938477, "reward_std": 0.12737250328063965, "rewards/accuracy_reward": 0.7729089856147766, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4765625, "step": 908, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 500.84375, "epoch": 0.0625559149404721, "grad_norm": 2.7298957690483943, "kl": 0.60546875, "learning_rate": 9.903754976608723e-07, "loss": -0.0, "reward": 1.8806135654449463, "reward_std": 0.08080194145441055, "rewards/accuracy_reward": 0.7212384939193726, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 909, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 519.078125, "epoch": 0.06262473332874544, "grad_norm": 0.6110141372852916, "kl": 0.640625, "learning_rate": 9.903543783768998e-07, "loss": 0.0, "reward": 2.262500047683716, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0625, "step": 910, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 537.0, "epoch": 0.06269355171701879, "grad_norm": 1.2499094764768175, "kl": 0.6171875, "learning_rate": 9.903332361727068e-07, "loss": 0.0, "reward": 1.8276029825210571, "reward_std": 0.2513548731803894, "rewards/accuracy_reward": 0.6510404944419861, "rewards/format_reward": 0.9375, "rewards/transform_reward": 0.0703125, "step": 911, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 532.078125, "epoch": 0.06276237010529213, "grad_norm": 1.435819932551718, "kl": 0.6328125, "learning_rate": 9.903120710492811e-07, "loss": 0.0, "reward": 2.3970940113067627, "reward_std": 0.023995941504836082, "rewards/accuracy_reward": 0.7924065589904785, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4296875, "step": 912, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 489.796875, "epoch": 0.06283118849356548, "grad_norm": 1.1473062240854073, "kl": 0.6796875, "learning_rate": 9.902908830076124e-07, "loss": -0.0, "reward": 2.5100793838500977, "reward_std": 0.11307895183563232, "rewards/accuracy_reward": 0.8694541454315186, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.453125, "step": 913, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 462.625, "epoch": 0.06290000688183883, "grad_norm": 0.0, "kl": 0.7109375, "learning_rate": 9.902696720486908e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 914, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 482.515625, "epoch": 0.06296882527011217, "grad_norm": 0.5164211356765994, "kl": 0.5625, "learning_rate": 9.902484381735077e-07, "loss": 0.0, "reward": 2.2312498092651367, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 915, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 495.375, "epoch": 0.06303764365838552, "grad_norm": 2.159367765481361, "kl": 0.66015625, "learning_rate": 9.90227181383056e-07, "loss": -0.0, "reward": 1.9109501838684082, "reward_std": 0.08736293762922287, "rewards/accuracy_reward": 0.7453251481056213, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 916, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 479.71875, "epoch": 0.06310646204665887, "grad_norm": 0.3781396833955475, "kl": 0.6171875, "learning_rate": 9.90205901678329e-07, "loss": 0.0, "reward": 2.6194868087768555, "reward_std": 0.003704826580360532, "rewards/accuracy_reward": 0.919486939907074, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 917, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 490.515625, "epoch": 0.06317528043493222, "grad_norm": 0.8419876232441998, "kl": 0.59765625, "learning_rate": 9.901845990603212e-07, "loss": -0.0, "reward": 2.0701634883880615, "reward_std": 0.0807858407497406, "rewards/accuracy_reward": 0.879538357257843, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 918, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 470.171875, "epoch": 0.06324409882320556, "grad_norm": 0.5498860164939794, "kl": 0.6171875, "learning_rate": 9.901632735300285e-07, "loss": 0.0, "reward": 1.715194821357727, "reward_std": 0.07093051820993423, "rewards/accuracy_reward": 0.5839447975158691, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 919, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 513.953125, "epoch": 0.06331291721147891, "grad_norm": 0.6849458059277901, "kl": 0.63671875, "learning_rate": 9.90141925088448e-07, "loss": -0.0, "reward": 2.3186564445495605, "reward_std": 0.06323608756065369, "rewards/accuracy_reward": 0.9233437776565552, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1953125, "step": 920, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 483.078125, "epoch": 0.06338173559975226, "grad_norm": 0.8808029269609164, "kl": 0.59765625, "learning_rate": 9.90120553736577e-07, "loss": 0.0, "reward": 1.9539809226989746, "reward_std": 0.05722030997276306, "rewards/accuracy_reward": 0.8102308511734009, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 921, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 484.953125, "epoch": 0.0634505539880256, "grad_norm": 1.008806246247344, "kl": 0.55078125, "learning_rate": 9.900991594754151e-07, "loss": -0.0, "reward": 2.3921875953674316, "reward_std": 0.11048543453216553, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 922, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 464.828125, "epoch": 0.06351937237629894, "grad_norm": 0.7250879571467229, "kl": 0.609375, "learning_rate": 9.900777423059616e-07, "loss": 0.0, "reward": 1.821874976158142, "reward_std": 0.12182654440402985, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 923, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 462.359375, "epoch": 0.0635881907645723, "grad_norm": 0.0, "kl": 0.55078125, "learning_rate": 9.90056302229218e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 924, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 463.78125, "epoch": 0.06365700915284564, "grad_norm": 0.849959478335177, "kl": 0.5703125, "learning_rate": 9.900348392461866e-07, "loss": -0.0, "reward": 2.543466567993164, "reward_std": 0.019863460212945938, "rewards/accuracy_reward": 0.8434665203094482, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 925, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 449.890625, "epoch": 0.06372582754111898, "grad_norm": 0.0, "kl": 0.6328125, "learning_rate": 9.900133533578703e-07, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 926, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 466.5, "epoch": 0.06379464592939234, "grad_norm": 0.0, "kl": 0.55859375, "learning_rate": 9.899918445652737e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 927, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 496.359375, "epoch": 0.06386346431766568, "grad_norm": 2.3737832505722087, "kl": 0.62890625, "learning_rate": 9.899703128694017e-07, "loss": 0.0, "reward": 2.2125000953674316, "reward_std": 0.26953840255737305, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 928, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 433.46875, "epoch": 0.06393228270593902, "grad_norm": 0.49616564350521636, "kl": 0.63671875, "learning_rate": 9.899487582712611e-07, "loss": -0.0, "reward": 2.1049365997314453, "reward_std": 0.010730026289820671, "rewards/accuracy_reward": 0.904936671257019, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 929, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 477.234375, "epoch": 0.06400110109421238, "grad_norm": 0.8799359692525206, "kl": 0.5546875, "learning_rate": 9.899271807718594e-07, "loss": 0.0, "reward": 1.9693151712417603, "reward_std": 0.057058099657297134, "rewards/accuracy_reward": 0.7974401116371155, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 930, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 468.140625, "epoch": 0.06406991948248572, "grad_norm": 0.9070897982817371, "kl": 0.59375, "learning_rate": 9.89905580372205e-07, "loss": -0.0, "reward": 1.795839786529541, "reward_std": 0.02461368404328823, "rewards/accuracy_reward": 0.6208398342132568, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 931, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 463.71875, "epoch": 0.06413873787075906, "grad_norm": 0.9703315805246113, "kl": 0.609375, "learning_rate": 9.89883957073308e-07, "loss": -0.0, "reward": 2.5841641426086426, "reward_std": 0.0734838992357254, "rewards/accuracy_reward": 0.8904142379760742, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 932, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 456.984375, "epoch": 0.06420755625903242, "grad_norm": 0.0, "kl": 0.5859375, "learning_rate": 9.898623108761786e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 933, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 438.71875, "epoch": 0.06427637464730576, "grad_norm": 6.0674533270673106, "kl": 0.609375, "learning_rate": 9.898406417818285e-07, "loss": 0.0, "reward": 1.5741889476776123, "reward_std": 0.18739652633666992, "rewards/accuracy_reward": 0.4616888761520386, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 934, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 483.140625, "epoch": 0.0643451930355791, "grad_norm": 0.0, "kl": 0.66015625, "learning_rate": 9.898189497912711e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 935, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 474.390625, "epoch": 0.06441401142385246, "grad_norm": 0.4642997632927709, "kl": 0.546875, "learning_rate": 9.8979723490552e-07, "loss": -0.0, "reward": 1.9842987060546875, "reward_std": 0.009950222447514534, "rewards/accuracy_reward": 0.8092986345291138, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 936, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 505.046875, "epoch": 0.0644828298121258, "grad_norm": 0.7008100544076569, "kl": 0.609375, "learning_rate": 9.897754971255902e-07, "loss": 0.0, "reward": 2.4733428955078125, "reward_std": 0.05656488612294197, "rewards/accuracy_reward": 0.9014678001403809, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 937, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 475.546875, "epoch": 0.06455164820039914, "grad_norm": 0.7316942542024382, "kl": 0.66796875, "learning_rate": 9.89753736452498e-07, "loss": -0.0, "reward": 2.3687500953674316, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 938, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 397.609375, "epoch": 0.06462046658867249, "grad_norm": 0.3923224881718391, "kl": 0.6796875, "learning_rate": 9.897319528872603e-07, "loss": 0.0, "reward": 2.1968750953674316, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 939, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 456.828125, "epoch": 0.06468928497694584, "grad_norm": 3.074247296022415, "kl": 0.51171875, "learning_rate": 9.897101464308954e-07, "loss": -0.0, "reward": 1.7766942977905273, "reward_std": 0.10900060087442398, "rewards/accuracy_reward": 0.6141942143440247, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 940, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 406.984375, "epoch": 0.06475810336521919, "grad_norm": 0.7106771281350159, "kl": 0.6640625, "learning_rate": 9.896883170844226e-07, "loss": 0.0, "reward": 2.1425442695617676, "reward_std": 0.014005481265485287, "rewards/accuracy_reward": 0.9487943649291992, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 941, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 456.09375, "epoch": 0.06482692175349253, "grad_norm": 1.1303648987653483, "kl": 0.65234375, "learning_rate": 9.89666464848862e-07, "loss": 0.0, "reward": 2.442187547683716, "reward_std": 0.06811804324388504, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4296875, "step": 942, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 396.40625, "epoch": 0.06489574014176588, "grad_norm": 0.6722563380901782, "kl": 0.671875, "learning_rate": 9.896445897252354e-07, "loss": 0.0, "reward": 1.896875023841858, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 943, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 459.640625, "epoch": 0.06496455853003923, "grad_norm": 0.5949972032117138, "kl": 0.66015625, "learning_rate": 9.89622691714565e-07, "loss": -0.0, "reward": 2.3421876430511475, "reward_std": 0.10227546095848083, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3671875, "step": 944, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 465.671875, "epoch": 0.06503337691831257, "grad_norm": 0.9604976557235609, "kl": 0.6796875, "learning_rate": 9.896007708178747e-07, "loss": -0.0, "reward": 2.296875, "reward_std": 0.12182655930519104, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 945, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 408.875, "epoch": 0.06510219530658592, "grad_norm": 0.7147071284161348, "kl": 0.64453125, "learning_rate": 9.895788270361888e-07, "loss": -0.0, "reward": 1.8928241729736328, "reward_std": 0.053033001720905304, "rewards/accuracy_reward": 0.7459490895271301, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 946, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 458.828125, "epoch": 0.06517101369485927, "grad_norm": 0.551503337290617, "kl": 0.63671875, "learning_rate": 9.895568603705333e-07, "loss": -0.0, "reward": 2.3307278156280518, "reward_std": 0.005144009832292795, "rewards/accuracy_reward": 0.6807279586791992, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 947, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 443.046875, "epoch": 0.06523983208313261, "grad_norm": 0.46459373201780296, "kl": 0.6484375, "learning_rate": 9.895348708219346e-07, "loss": -0.0, "reward": 1.9375, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 948, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 435.21875, "epoch": 0.06530865047140597, "grad_norm": 0.8079347719866313, "kl": 0.58203125, "learning_rate": 9.895128583914208e-07, "loss": 0.0, "reward": 2.125, "reward_std": 0.13887301087379456, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 949, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 433.1875, "epoch": 0.06537746885967931, "grad_norm": 1.358275350924688, "kl": 0.6953125, "learning_rate": 9.894908230800205e-07, "loss": -0.0, "reward": 2.069965362548828, "reward_std": 0.10227546840906143, "rewards/accuracy_reward": 0.4652777910232544, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 950, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 416.46875, "epoch": 0.06544628724795265, "grad_norm": 0.4950432081802501, "kl": 0.68359375, "learning_rate": 9.894687648887641e-07, "loss": 0.0, "reward": 2.4796876907348633, "reward_std": 0.022097086533904076, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4296875, "step": 951, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 428.375, "epoch": 0.065515105636226, "grad_norm": 0.9325292413458414, "kl": 0.609375, "learning_rate": 9.894466838186826e-07, "loss": 0.0, "reward": 1.6977746486663818, "reward_std": 0.04946090281009674, "rewards/accuracy_reward": 0.5727747678756714, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 952, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 443.75, "epoch": 0.06558392402449935, "grad_norm": 0.0, "kl": 0.5859375, "learning_rate": 9.894245798708078e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 953, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 421.0, "epoch": 0.06565274241277269, "grad_norm": 0.5146532254865921, "kl": 0.7109375, "learning_rate": 9.89402453046173e-07, "loss": -0.0, "reward": 2.2750000953674316, "reward_std": 0.09209854900836945, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.28125, "step": 954, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 460.703125, "epoch": 0.06572156080104603, "grad_norm": 0.41820536125837593, "kl": 0.5703125, "learning_rate": 9.893803033458127e-07, "loss": 0.0, "reward": 2.3209543228149414, "reward_std": 0.0031149203423410654, "rewards/accuracy_reward": 0.6709543466567993, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 955, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 445.234375, "epoch": 0.06579037918931939, "grad_norm": 1.0666018815935863, "kl": 0.69140625, "learning_rate": 9.893581307707618e-07, "loss": 0.0, "reward": 2.3937501907348633, "reward_std": 0.07763238251209259, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 956, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 409.09375, "epoch": 0.06585919757759273, "grad_norm": 0.0, "kl": 0.65625, "learning_rate": 9.893359353220572e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 957, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 400.09375, "epoch": 0.06592801596586607, "grad_norm": 0.0, "kl": 0.65625, "learning_rate": 9.89313717000736e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 958, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 430.0, "epoch": 0.06599683435413943, "grad_norm": 1.1609781125122773, "kl": 0.59765625, "learning_rate": 9.892914758078368e-07, "loss": -0.0, "reward": 1.9937500953674316, "reward_std": 0.07763238251209259, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 959, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 431.296875, "epoch": 0.06606565274241277, "grad_norm": 1.5582429041072514, "kl": 0.72265625, "learning_rate": 9.892692117443992e-07, "loss": -0.0, "reward": 2.203125, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 960, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 415.875, "epoch": 0.06613447113068611, "grad_norm": 5.223098984753251, "kl": 0.625, "learning_rate": 9.892469248114639e-07, "loss": -0.0, "reward": 2.125, "reward_std": 0.08017837256193161, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 961, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.25, "epoch": 0.06620328951895947, "grad_norm": 1.0158088804535375, "kl": 0.66796875, "learning_rate": 9.892246150100727e-07, "loss": 0.0, "reward": 2.2718751430511475, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 962, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 476.921875, "epoch": 0.06627210790723281, "grad_norm": 1.0170429046869534, "kl": 0.65625, "learning_rate": 9.892022823412685e-07, "loss": -0.0, "reward": 2.2707252502441406, "reward_std": 0.0834321528673172, "rewards/accuracy_reward": 0.7332252860069275, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 963, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 467.5625, "epoch": 0.06634092629550616, "grad_norm": 3.6087069730180024, "kl": 0.546875, "learning_rate": 9.891799268060948e-07, "loss": -0.0, "reward": 1.896674633026123, "reward_std": 0.010341096669435501, "rewards/accuracy_reward": 0.7216745615005493, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 964, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 447.328125, "epoch": 0.06640974468377951, "grad_norm": 0.7044849290101102, "kl": 0.6640625, "learning_rate": 9.891575484055968e-07, "loss": 0.0, "reward": 2.3187499046325684, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 965, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 434.046875, "epoch": 0.06647856307205285, "grad_norm": 0.49634490209276155, "kl": 0.59765625, "learning_rate": 9.891351471408204e-07, "loss": 0.0, "reward": 2.5656251907348633, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 966, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 416.625, "epoch": 0.0665473814603262, "grad_norm": 0.7967340212881753, "kl": 0.6484375, "learning_rate": 9.891127230128131e-07, "loss": 0.0, "reward": 2.125, "reward_std": 0.08017837256193161, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 967, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 409.078125, "epoch": 0.06661619984859955, "grad_norm": 0.666117908892424, "kl": 0.65625, "learning_rate": 9.890902760226224e-07, "loss": 0.0, "reward": 2.6212286949157715, "reward_std": 0.007702291943132877, "rewards/accuracy_reward": 0.9212287664413452, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 968, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 442.75, "epoch": 0.0666850182368729, "grad_norm": 1.1496228113983953, "kl": 0.6796875, "learning_rate": 9.89067806171298e-07, "loss": -0.0, "reward": 2.3046875, "reward_std": 0.0915335938334465, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3671875, "step": 969, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 442.078125, "epoch": 0.06675383662514624, "grad_norm": 0.6286019875816509, "kl": 0.546875, "learning_rate": 9.8904531345989e-07, "loss": 0.0, "reward": 2.3048057556152344, "reward_std": 0.01274784654378891, "rewards/accuracy_reward": 0.654805600643158, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 970, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 471.984375, "epoch": 0.06682265501341958, "grad_norm": 0.0, "kl": 0.5546875, "learning_rate": 9.890227978894498e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 971, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 428.15625, "epoch": 0.06689147340169294, "grad_norm": 0.6538572627132377, "kl": 0.625, "learning_rate": 9.890002594610298e-07, "loss": -0.0, "reward": 1.6300108432769775, "reward_std": 0.07248764485120773, "rewards/accuracy_reward": 0.5112608075141907, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 972, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 422.25, "epoch": 0.06696029178996628, "grad_norm": 1.0683210396694907, "kl": 0.64453125, "learning_rate": 9.889776981756837e-07, "loss": -0.0, "reward": 1.951244831085205, "reward_std": 0.1286695897579193, "rewards/accuracy_reward": 0.7824949026107788, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 973, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 406.703125, "epoch": 0.06702911017823962, "grad_norm": 1.1014353679726088, "kl": 0.65625, "learning_rate": 9.889551140344656e-07, "loss": -0.0, "reward": 2.1437501907348633, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 974, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 404.6875, "epoch": 0.06709792856651298, "grad_norm": 0.834761779610865, "kl": 0.67578125, "learning_rate": 9.889325070384316e-07, "loss": 0.0, "reward": 2.546875, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 975, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 411.140625, "epoch": 0.06716674695478632, "grad_norm": 0.0, "kl": 0.6640625, "learning_rate": 9.88909877188638e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 976, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 482.65625, "epoch": 0.06723556534305966, "grad_norm": 0.3788155060702713, "kl": 0.5390625, "learning_rate": 9.888872244861431e-07, "loss": -0.0, "reward": 1.8277432918548584, "reward_std": 0.0029470277950167656, "rewards/accuracy_reward": 0.6777431964874268, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 977, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 471.171875, "epoch": 0.06730438373133302, "grad_norm": 0.784472397997428, "kl": 0.6328125, "learning_rate": 9.88864548932005e-07, "loss": 0.0, "reward": 2.4864449501037598, "reward_std": 0.0492275208234787, "rewards/accuracy_reward": 0.8114448189735413, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 978, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 445.046875, "epoch": 0.06737320211960636, "grad_norm": 1.000099551729977, "kl": 0.5859375, "learning_rate": 9.888418505272843e-07, "loss": 0.0, "reward": 2.396523952484131, "reward_std": 0.07633928954601288, "rewards/accuracy_reward": 0.7402740120887756, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 979, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 466.453125, "epoch": 0.0674420205078797, "grad_norm": 1.5559558931492659, "kl": 0.69140625, "learning_rate": 9.888191292730417e-07, "loss": -0.0, "reward": 2.25, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3125, "step": 980, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 458.953125, "epoch": 0.06751083889615306, "grad_norm": 1.1299978562869741, "kl": 0.578125, "learning_rate": 9.88796385170339e-07, "loss": -0.0, "reward": 2.32412052154541, "reward_std": 0.0032515546772629023, "rewards/accuracy_reward": 0.6741204261779785, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 981, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 445.96875, "epoch": 0.0675796572844264, "grad_norm": 1.261211154848303, "kl": 0.6015625, "learning_rate": 9.887736182202398e-07, "loss": 0.0, "reward": 2.1812500953674316, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 982, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 474.0625, "epoch": 0.06764847567269974, "grad_norm": 0.6063802282569893, "kl": 0.5546875, "learning_rate": 9.887508284238077e-07, "loss": 0.0, "reward": 2.0627684593200684, "reward_std": 0.07422668486833572, "rewards/accuracy_reward": 0.45651865005493164, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 983, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 445.75, "epoch": 0.0677172940609731, "grad_norm": 2.9188409429836786, "kl": 0.60546875, "learning_rate": 9.887280157821085e-07, "loss": -0.0, "reward": 1.8639694452285767, "reward_std": 0.01830437034368515, "rewards/accuracy_reward": 0.6889694333076477, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 984, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 419.46875, "epoch": 0.06778611244924644, "grad_norm": 1.3123149913145382, "kl": 0.6484375, "learning_rate": 9.887051802962084e-07, "loss": 0.0, "reward": 1.932687520980835, "reward_std": 0.0893261730670929, "rewards/accuracy_reward": 0.7701875567436218, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 985, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 461.9375, "epoch": 0.06785493083751978, "grad_norm": 0.7462957519848863, "kl": 0.67578125, "learning_rate": 9.886823219671743e-07, "loss": 0.0, "reward": 2.1796875, "reward_std": 0.022097086533904076, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4296875, "step": 986, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 412.453125, "epoch": 0.06792374922579313, "grad_norm": 0.7498536286398747, "kl": 0.66015625, "learning_rate": 9.886594407960752e-07, "loss": 0.0, "reward": 2.1321821212768555, "reward_std": 0.008904434740543365, "rewards/accuracy_reward": 0.9321820735931396, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 987, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 479.59375, "epoch": 0.06799256761406648, "grad_norm": 0.9458590200764406, "kl": 0.56640625, "learning_rate": 9.886365367839804e-07, "loss": -0.0, "reward": 2.4889254570007324, "reward_std": 0.04845817759633064, "rewards/accuracy_reward": 0.8139253854751587, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 988, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 421.78125, "epoch": 0.06806138600233982, "grad_norm": 0.0, "kl": 0.66796875, "learning_rate": 9.886136099319604e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 989, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 465.609375, "epoch": 0.06813020439061317, "grad_norm": 0.0, "kl": 0.6875, "learning_rate": 9.885906602410871e-07, "loss": 0.0, "reward": 2.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 990, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 449.703125, "epoch": 0.06819902277888652, "grad_norm": 1.0259991852489556, "kl": 0.58984375, "learning_rate": 9.88567687712433e-07, "loss": 0.0, "reward": 1.7425193786621094, "reward_std": 0.04641212522983551, "rewards/accuracy_reward": 0.6175193786621094, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 991, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 463.171875, "epoch": 0.06826784116715987, "grad_norm": 0.6864838117389178, "kl": 0.671875, "learning_rate": 9.88544692347072e-07, "loss": -0.0, "reward": 2.3195691108703613, "reward_std": 0.05047547444701195, "rewards/accuracy_reward": 0.669569194316864, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 992, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 439.015625, "epoch": 0.06833665955543321, "grad_norm": 0.0, "kl": 0.64453125, "learning_rate": 9.885216741460787e-07, "loss": 0.0, "reward": 2.391913652420044, "reward_std": 0.0, "rewards/accuracy_reward": 0.7419135570526123, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 993, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 427.59375, "epoch": 0.06840547794370656, "grad_norm": 0.9131165876171125, "kl": 0.66015625, "learning_rate": 9.884986331105293e-07, "loss": -0.0, "reward": 2.0638070106506348, "reward_std": 0.07374168187379837, "rewards/accuracy_reward": 0.8669319152832031, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 994, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 449.9375, "epoch": 0.0684742963319799, "grad_norm": 0.6397832904943728, "kl": 0.640625, "learning_rate": 9.884755692415007e-07, "loss": -0.0, "reward": 1.8117716312408447, "reward_std": 0.00435929000377655, "rewards/accuracy_reward": 0.6617716550827026, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 995, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 420.375, "epoch": 0.06854311472025325, "grad_norm": 0.713703397792915, "kl": 0.6796875, "learning_rate": 9.884524825400711e-07, "loss": 0.0, "reward": 2.03125, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 996, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 412.203125, "epoch": 0.0686119331085266, "grad_norm": 0.733057124428289, "kl": 0.69140625, "learning_rate": 9.884293730073195e-07, "loss": 0.0, "reward": 2.125, "reward_std": 0.08017838001251221, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 997, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 474.875, "epoch": 0.06868075149679995, "grad_norm": 0.6722062195107297, "kl": 0.5703125, "learning_rate": 9.884062406443258e-07, "loss": 0.0, "reward": 2.4474949836730957, "reward_std": 0.0798250064253807, "rewards/accuracy_reward": 0.7943702340126038, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 998, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 446.9375, "epoch": 0.06874956988507329, "grad_norm": 0.0, "kl": 0.6171875, "learning_rate": 9.88383085452172e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 999, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 463.609375, "epoch": 0.06881838827334665, "grad_norm": 1.4624739033975136, "kl": 0.6171875, "learning_rate": 9.883599074319394e-07, "loss": 0.0, "reward": 1.6332879066467285, "reward_std": 0.14692902565002441, "rewards/accuracy_reward": 0.5176628828048706, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1000, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 436.375, "epoch": 0.06888720666161999, "grad_norm": 0.8967600268985603, "kl": 0.6484375, "learning_rate": 9.883367065847124e-07, "loss": -0.0, "reward": 1.9147748947143555, "reward_std": 0.02106022834777832, "rewards/accuracy_reward": 0.7397748827934265, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1001, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 412.53125, "epoch": 0.06895602504989333, "grad_norm": 13.127708824451814, "kl": 0.6796875, "learning_rate": 9.883134829115749e-07, "loss": 0.0, "reward": 2.1031250953674316, "reward_std": 0.08647122234106064, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1002, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 470.65625, "epoch": 0.06902484343816667, "grad_norm": 0.0, "kl": 0.56640625, "learning_rate": 9.882902364136124e-07, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1003, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 448.890625, "epoch": 0.06909366182644003, "grad_norm": 2.2749061547303, "kl": 0.625, "learning_rate": 9.882669670919116e-07, "loss": 0.0, "reward": 2.1556060314178467, "reward_std": 0.12409966439008713, "rewards/accuracy_reward": 0.5399811267852783, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1004, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 443.84375, "epoch": 0.06916248021471337, "grad_norm": 0.5275334815101851, "kl": 0.70703125, "learning_rate": 9.882436749475602e-07, "loss": -0.0, "reward": 2.065624952316284, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1005, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 455.59375, "epoch": 0.06923129860298671, "grad_norm": 0.8439610770320979, "kl": 0.69921875, "learning_rate": 9.882203599816472e-07, "loss": 0.0, "reward": 1.7152094841003418, "reward_std": 0.09827403724193573, "rewards/accuracy_reward": 0.5839595794677734, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1006, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 449.140625, "epoch": 0.06930011699126007, "grad_norm": 92.48883781985799, "kl": 0.7109375, "learning_rate": 9.881970221952618e-07, "loss": -0.0, "reward": 1.84375, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1007, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 465.390625, "epoch": 0.06936893537953341, "grad_norm": 0.5726049229142472, "kl": 0.69921875, "learning_rate": 9.88173661589495e-07, "loss": 0.0, "reward": 1.896875023841858, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1008, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 471.453125, "epoch": 0.06943775376780675, "grad_norm": 0.0, "kl": 0.60546875, "learning_rate": 9.881502781654393e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1009, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 480.65625, "epoch": 0.06950657215608011, "grad_norm": 0.8375329808779389, "kl": 0.60546875, "learning_rate": 9.88126871924187e-07, "loss": -0.0, "reward": 2.0160317420959473, "reward_std": 0.09316806495189667, "rewards/accuracy_reward": 0.8379067182540894, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1010, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 498.0, "epoch": 0.06957539054435345, "grad_norm": 1.6563542872740387, "kl": 0.56640625, "learning_rate": 9.881034428668325e-07, "loss": 0.0, "reward": 1.9768705368041992, "reward_std": 0.0036034013610333204, "rewards/accuracy_reward": 0.8018704652786255, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1011, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 477.109375, "epoch": 0.0696442089326268, "grad_norm": 0.4578608153161652, "kl": 0.59765625, "learning_rate": 9.880799909944708e-07, "loss": -0.0, "reward": 2.4562501907348633, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1012, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 492.1875, "epoch": 0.06971302732090015, "grad_norm": 1.663600610319303, "kl": 0.6796875, "learning_rate": 9.880565163081982e-07, "loss": 0.0, "reward": 2.3382515907287598, "reward_std": 0.34402555227279663, "rewards/accuracy_reward": 0.6991890668869019, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 1013, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 420.859375, "epoch": 0.06978184570917349, "grad_norm": 0.43203434588661555, "kl": 0.6640625, "learning_rate": 9.88033018809112e-07, "loss": 0.0, "reward": 1.780439853668213, "reward_std": 0.016017932444810867, "rewards/accuracy_reward": 0.6398147940635681, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1014, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 539.59375, "epoch": 0.06985066409744684, "grad_norm": 0.6466749354408561, "kl": 0.53125, "learning_rate": 9.880094984983104e-07, "loss": 0.0, "reward": 2.4259586334228516, "reward_std": 0.09319337457418442, "rewards/accuracy_reward": 0.7884586453437805, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.5, "step": 1015, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 489.609375, "epoch": 0.06991948248572019, "grad_norm": 1.2708679788534492, "kl": 0.69140625, "learning_rate": 9.879859553768925e-07, "loss": 0.0, "reward": 2.379687547683716, "reward_std": 0.12504829466342926, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3515625, "step": 1016, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 488.25, "epoch": 0.06998830087399353, "grad_norm": 0.0, "kl": 0.703125, "learning_rate": 9.879623894459594e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1017, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 472.4375, "epoch": 0.07005711926226688, "grad_norm": 2.0766238756989193, "kl": 0.71484375, "learning_rate": 9.87938800706612e-07, "loss": 0.0, "reward": 2.010937452316284, "reward_std": 0.1771668791770935, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3203125, "step": 1018, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 501.109375, "epoch": 0.07012593765054022, "grad_norm": 0.0, "kl": 0.59375, "learning_rate": 9.879151891599535e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1019, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 477.21875, "epoch": 0.07019475603881357, "grad_norm": 0.0, "kl": 0.73046875, "learning_rate": 9.878915548070873e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1020, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 509.21875, "epoch": 0.07026357442708692, "grad_norm": 0.5649938084818281, "kl": 0.59765625, "learning_rate": 9.878678976491178e-07, "loss": -0.0, "reward": 2.0170140266418457, "reward_std": 0.12182654440402985, "rewards/accuracy_reward": 0.8420138955116272, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1021, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 498.6875, "epoch": 0.07033239281536026, "grad_norm": 0.42984893685366554, "kl": 0.62109375, "learning_rate": 9.878442176871511e-07, "loss": -0.0, "reward": 2.174017906188965, "reward_std": 0.0059388987720012665, "rewards/accuracy_reward": 0.5490177869796753, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1022, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 498.4375, "epoch": 0.07040121120363362, "grad_norm": 0.738273631137754, "kl": 0.6015625, "learning_rate": 9.87820514922294e-07, "loss": -0.0, "reward": 2.5875000953674316, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1023, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 507.03125, "epoch": 0.07047002959190696, "grad_norm": 1.8758642567389474, "kl": 0.609375, "learning_rate": 9.877967893556545e-07, "loss": -0.0, "reward": 1.6162090301513672, "reward_std": 0.12649743258953094, "rewards/accuracy_reward": 0.5162088871002197, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.0, "step": 1024, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 541.84375, "epoch": 0.0705388479801803, "grad_norm": 1.2264839678152029, "kl": 0.55078125, "learning_rate": 9.877730409883413e-07, "loss": 0.0, "reward": 2.2430717945098877, "reward_std": 0.279762864112854, "rewards/accuracy_reward": 0.6368217468261719, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.5, "step": 1025, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 424.828125, "epoch": 0.07060766636845366, "grad_norm": 0.5206505484638786, "kl": 0.703125, "learning_rate": 9.877492698214648e-07, "loss": -0.0, "reward": 1.9968663454055786, "reward_std": 0.014965700916945934, "rewards/accuracy_reward": 0.8218662738800049, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1026, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 474.609375, "epoch": 0.070676484756727, "grad_norm": 0.477410866775264, "kl": 0.7265625, "learning_rate": 9.87725475856136e-07, "loss": -0.0, "reward": 2.580550193786621, "reward_std": 0.0170146431773901, "rewards/accuracy_reward": 0.8899250030517578, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1027, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 472.046875, "epoch": 0.07074530314500034, "grad_norm": 0.7767453696571635, "kl": 0.69140625, "learning_rate": 9.87701659093467e-07, "loss": -0.0, "reward": 2.531358242034912, "reward_std": 0.060314666479825974, "rewards/accuracy_reward": 0.8969833850860596, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1028, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 498.875, "epoch": 0.0708141215332737, "grad_norm": 1.1012351018628292, "kl": 0.59375, "learning_rate": 9.876778195345712e-07, "loss": 0.0, "reward": 2.269874334335327, "reward_std": 0.24536806344985962, "rewards/accuracy_reward": 0.626124382019043, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1029, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 434.125, "epoch": 0.07088293992154704, "grad_norm": 0.0, "kl": 0.69921875, "learning_rate": 9.876539571805627e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1030, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 471.359375, "epoch": 0.07095175830982038, "grad_norm": 0.8482187097676797, "kl": 0.62890625, "learning_rate": 9.87630072032557e-07, "loss": -0.0, "reward": 1.878989577293396, "reward_std": 0.07600992918014526, "rewards/accuracy_reward": 0.722739577293396, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1031, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 471.9375, "epoch": 0.07102057669809374, "grad_norm": 0.5488078608145791, "kl": 0.76171875, "learning_rate": 9.876061640916703e-07, "loss": 0.0, "reward": 1.9562500715255737, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1032, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 482.140625, "epoch": 0.07108939508636708, "grad_norm": 0.0, "kl": 0.6328125, "learning_rate": 9.875822333590207e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1033, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 471.09375, "epoch": 0.07115821347464042, "grad_norm": 0.961541420347824, "kl": 0.734375, "learning_rate": 9.875582798357262e-07, "loss": -0.0, "reward": 2.21147084236145, "reward_std": 0.1070631891489029, "rewards/accuracy_reward": 0.6895958185195923, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1034, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 491.703125, "epoch": 0.07122703186291376, "grad_norm": 1.164554184127639, "kl": 0.75, "learning_rate": 9.875343035229068e-07, "loss": -0.0, "reward": 1.928653597831726, "reward_std": 0.009652992710471153, "rewards/accuracy_reward": 0.7536535263061523, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1035, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 433.46875, "epoch": 0.07129585025118712, "grad_norm": 1.216315098983686, "kl": 0.6875, "learning_rate": 9.87510304421683e-07, "loss": 0.0, "reward": 1.775404691696167, "reward_std": 0.07893675565719604, "rewards/accuracy_reward": 0.6129047870635986, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1036, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 489.390625, "epoch": 0.07136466863946046, "grad_norm": 0.0, "kl": 0.75, "learning_rate": 9.874862825331766e-07, "loss": 0.0, "reward": 2.637500047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1037, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 514.25, "epoch": 0.0714334870277338, "grad_norm": 1.8925479442052808, "kl": 0.7265625, "learning_rate": 9.874622378585104e-07, "loss": 0.0, "reward": 2.3734374046325684, "reward_std": 0.14394861459732056, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3046875, "step": 1038, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 558.25, "epoch": 0.07150230541600716, "grad_norm": 0.7524385247790438, "kl": 0.59375, "learning_rate": 9.874381703988083e-07, "loss": 0.0, "reward": 2.0322189331054688, "reward_std": 0.07896329462528229, "rewards/accuracy_reward": 0.8509688377380371, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1039, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 429.515625, "epoch": 0.0715711238042805, "grad_norm": 0.0, "kl": 0.7265625, "learning_rate": 9.874140801551956e-07, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1040, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 424.453125, "epoch": 0.07163994219255385, "grad_norm": 0.722072200031621, "kl": 0.703125, "learning_rate": 9.873899671287977e-07, "loss": 0.0, "reward": 1.8937500715255737, "reward_std": 0.017677675932645798, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1041, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 534.984375, "epoch": 0.0717087605808272, "grad_norm": 0.9900088469827407, "kl": 0.63671875, "learning_rate": 9.873658313207423e-07, "loss": -0.0, "reward": 2.3187499046325684, "reward_std": 0.15782484412193298, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1042, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 597.125, "epoch": 0.07177757896910054, "grad_norm": 1.1519044772822298, "kl": 0.6953125, "learning_rate": 9.873416727321573e-07, "loss": 0.0, "reward": 2.2991561889648438, "reward_std": 0.22168666124343872, "rewards/accuracy_reward": 0.7929061055183411, "rewards/format_reward": 0.90625, "rewards/transform_reward": 0.4375, "step": 1043, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 570.90625, "epoch": 0.07184639735737389, "grad_norm": 2.69230346276844, "kl": 0.6875, "learning_rate": 9.873174913641717e-07, "loss": 0.0, "reward": 2.145312547683716, "reward_std": 0.3444482684135437, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.2265625, "step": 1044, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 552.0625, "epoch": 0.07191521574564724, "grad_norm": 0.0, "kl": 0.68359375, "learning_rate": 9.872932872179163e-07, "loss": 0.0, "reward": 2.575000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1045, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 453.890625, "epoch": 0.07198403413392059, "grad_norm": 0.0, "kl": 0.7265625, "learning_rate": 9.87269060294522e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1046, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 557.796875, "epoch": 0.07205285252219393, "grad_norm": 0.0, "kl": 0.6171875, "learning_rate": 9.872448105951215e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1047, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 491.0, "epoch": 0.07212167091046728, "grad_norm": 1.2451385853003172, "kl": 0.68359375, "learning_rate": 9.872205381208481e-07, "loss": -0.0, "reward": 2.4328126907348633, "reward_std": 0.19950264692306519, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1048, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 573.328125, "epoch": 0.07219048929874063, "grad_norm": 0.48986114543955395, "kl": 0.59765625, "learning_rate": 9.871962428728365e-07, "loss": -0.0, "reward": 2.4562501907348633, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1049, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 484.234375, "epoch": 0.07225930768701397, "grad_norm": 0.0, "kl": 0.6796875, "learning_rate": 9.871719248522221e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1050, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 524.0, "epoch": 0.07232812607528731, "grad_norm": 0.6379437435703089, "kl": 0.63671875, "learning_rate": 9.87147584060142e-07, "loss": -0.0, "reward": 1.84796941280365, "reward_std": 0.09589764475822449, "rewards/accuracy_reward": 0.7104693055152893, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 1051, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 543.1875, "epoch": 0.07239694446356067, "grad_norm": 0.7941361603328368, "kl": 0.62109375, "learning_rate": 9.871232204977333e-07, "loss": 0.0, "reward": 2.606250047683716, "reward_std": 0.07763238251209259, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1052, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 563.515625, "epoch": 0.07246576285183401, "grad_norm": 0.0, "kl": 0.609375, "learning_rate": 9.870988341661354e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1053, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 493.578125, "epoch": 0.07253458124010735, "grad_norm": 0.0, "kl": 0.6796875, "learning_rate": 9.870744250664878e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1054, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 539.875, "epoch": 0.07260339962838071, "grad_norm": 2.035964653107079, "kl": 0.65234375, "learning_rate": 9.870499931999316e-07, "loss": -0.0, "reward": 1.8861356973648071, "reward_std": 0.08441835641860962, "rewards/accuracy_reward": 0.7298856377601624, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1055, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 538.078125, "epoch": 0.07267221801665405, "grad_norm": 0.0, "kl": 0.58984375, "learning_rate": 9.870255385676088e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1056, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 558.578125, "epoch": 0.07274103640492739, "grad_norm": 0.6241709152548898, "kl": 0.609375, "learning_rate": 9.870010611706625e-07, "loss": -0.0, "reward": 2.4140625, "reward_std": 0.13572776317596436, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1057, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 491.09375, "epoch": 0.07280985479320075, "grad_norm": 0.0, "kl": 0.67578125, "learning_rate": 9.869765610102366e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1058, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 579.203125, "epoch": 0.07287867318147409, "grad_norm": 0.9890367423732086, "kl": 0.671875, "learning_rate": 9.869520380874766e-07, "loss": -0.0, "reward": 2.0283992290496826, "reward_std": 0.11391721665859222, "rewards/accuracy_reward": 0.42839932441711426, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.484375, "step": 1059, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 505.046875, "epoch": 0.07294749156974743, "grad_norm": 1.3013159724175327, "kl": 0.6640625, "learning_rate": 9.869274924035284e-07, "loss": -0.0, "reward": 2.3799984455108643, "reward_std": 0.1612435281276703, "rewards/accuracy_reward": 0.7081236243247986, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1060, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 580.6875, "epoch": 0.07301630995802079, "grad_norm": 0.8988550558850684, "kl": 0.6484375, "learning_rate": 9.869029239595396e-07, "loss": 0.0, "reward": 2.0118725299835205, "reward_std": 0.19765204191207886, "rewards/accuracy_reward": 0.8868725299835205, "rewards/format_reward": 0.9375, "rewards/transform_reward": 0.0, "step": 1061, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 550.46875, "epoch": 0.07308512834629413, "grad_norm": 2.1430463528215795, "kl": 0.625, "learning_rate": 9.868783327566585e-07, "loss": 0.0, "reward": 1.962499976158142, "reward_std": 0.2474873811006546, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 1062, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 588.203125, "epoch": 0.07315394673456747, "grad_norm": 0.8589715578661373, "kl": 0.67578125, "learning_rate": 9.868537187960344e-07, "loss": -0.0, "reward": 2.0135951042175293, "reward_std": 0.12717898190021515, "rewards/accuracy_reward": 0.8354701995849609, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1063, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 576.390625, "epoch": 0.07322276512284083, "grad_norm": 0.47629393802291753, "kl": 0.67578125, "learning_rate": 9.868290820788182e-07, "loss": 0.0, "reward": 2.2313032150268555, "reward_std": 0.12512756884098053, "rewards/accuracy_reward": 0.6703656911849976, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4296875, "step": 1064, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 578.15625, "epoch": 0.07329158351111417, "grad_norm": 0.2687387713316765, "kl": 0.6640625, "learning_rate": 9.86804422606161e-07, "loss": 0.0, "reward": 2.1156249046325684, "reward_std": 0.09722718596458435, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.25, "step": 1065, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 503.203125, "epoch": 0.07336040189938751, "grad_norm": 1.2067587196436032, "kl": 0.6484375, "learning_rate": 9.86779740379216e-07, "loss": -0.0, "reward": 1.7094800472259521, "reward_std": 0.21509040892124176, "rewards/accuracy_reward": 0.6157301068305969, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.0, "step": 1066, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 523.515625, "epoch": 0.07342922028766086, "grad_norm": 0.43292215312225246, "kl": 0.6484375, "learning_rate": 9.867550353991364e-07, "loss": 0.0, "reward": 2.515625, "reward_std": 0.09722718596458435, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 1067, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 505.484375, "epoch": 0.07349803867593421, "grad_norm": 1.0733116708795924, "kl": 0.65625, "learning_rate": 9.86730307667077e-07, "loss": 0.0, "reward": 1.6823139190673828, "reward_std": 0.34012076258659363, "rewards/accuracy_reward": 0.6073137521743774, "rewards/format_reward": 0.9375, "rewards/transform_reward": 0.0, "step": 1068, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 536.859375, "epoch": 0.07356685706420756, "grad_norm": 0.9887555110774512, "kl": 0.640625, "learning_rate": 9.867055571841938e-07, "loss": -0.0, "reward": 2.4136414527893066, "reward_std": 0.6030233502388, "rewards/accuracy_reward": 0.8464537262916565, "rewards/format_reward": 0.921875, "rewards/transform_reward": 0.4609375, "step": 1069, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 503.703125, "epoch": 0.0736356754524809, "grad_norm": 0.3873422085026301, "kl": 0.6953125, "learning_rate": 9.866807839516435e-07, "loss": 0.0, "reward": 2.015625, "reward_std": 0.09722718596458435, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1070, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 508.03125, "epoch": 0.07370449384075425, "grad_norm": 0.0, "kl": 0.71484375, "learning_rate": 9.866559879705845e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1071, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 498.3125, "epoch": 0.0737733122290276, "grad_norm": 0.5212794385351687, "kl": 0.6796875, "learning_rate": 9.866311692421753e-07, "loss": -0.0, "reward": 2.6261658668518066, "reward_std": 0.0043990048579871655, "rewards/accuracy_reward": 0.9261656999588013, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1072, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 482.609375, "epoch": 0.07384213061730094, "grad_norm": 0.0, "kl": 0.703125, "learning_rate": 9.866063277675764e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1073, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 498.71875, "epoch": 0.0739109490055743, "grad_norm": 0.9175159324482239, "kl": 0.6953125, "learning_rate": 9.865814635479486e-07, "loss": 0.0, "reward": 2.494969367980957, "reward_std": 0.07456324994564056, "rewards/accuracy_reward": 0.8199692368507385, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1074, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 494.828125, "epoch": 0.07397976739384764, "grad_norm": 2.3278006501758384, "kl": 0.75, "learning_rate": 9.86556576584454e-07, "loss": -0.0, "reward": 2.4202380180358887, "reward_std": 0.08873279392719269, "rewards/accuracy_reward": 0.7389881610870361, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1075, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 515.0, "epoch": 0.07404858578212098, "grad_norm": 0.7006909881641865, "kl": 0.66796875, "learning_rate": 9.865316668782566e-07, "loss": 0.0, "reward": 1.929319143295288, "reward_std": 0.10058422386646271, "rewards/accuracy_reward": 0.7730691432952881, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1076, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 536.71875, "epoch": 0.07411740417039434, "grad_norm": 0.6145155940829898, "kl": 0.65234375, "learning_rate": 9.8650673443052e-07, "loss": -0.0, "reward": 2.349169969558716, "reward_std": 0.09740085899829865, "rewards/accuracy_reward": 0.7116698622703552, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.5, "step": 1077, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 479.75, "epoch": 0.07418622255866768, "grad_norm": 3.196613053244778, "kl": 0.74609375, "learning_rate": 9.8648177924241e-07, "loss": 0.0, "reward": 2.2484374046325684, "reward_std": 0.041185662150382996, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3515625, "step": 1078, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 478.75, "epoch": 0.07425504094694102, "grad_norm": 0.0, "kl": 0.7109375, "learning_rate": 9.864568013150928e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1079, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 484.828125, "epoch": 0.07432385933521438, "grad_norm": 1.7682921890861847, "kl": 0.7265625, "learning_rate": 9.864318006497358e-07, "loss": 0.0, "reward": 1.9718749523162842, "reward_std": 0.12182654440402985, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1080, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 482.65625, "epoch": 0.07439267772348772, "grad_norm": 3.265715572913659, "kl": 0.69921875, "learning_rate": 9.864067772475082e-07, "loss": 0.0, "reward": 2.4156851768493652, "reward_std": 0.09241241961717606, "rewards/accuracy_reward": 0.7500600814819336, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1081, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 530.828125, "epoch": 0.07446149611176106, "grad_norm": 0.8573133866190368, "kl": 0.64453125, "learning_rate": 9.86381731109579e-07, "loss": 0.0, "reward": 2.2292332649230957, "reward_std": 0.07917369902133942, "rewards/accuracy_reward": 0.5979831218719482, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 1082, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 495.828125, "epoch": 0.0745303145000344, "grad_norm": 0.0, "kl": 0.7109375, "learning_rate": 9.863566622371194e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1083, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 482.859375, "epoch": 0.07459913288830776, "grad_norm": 0.0, "kl": 0.7265625, "learning_rate": 9.863315706313006e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1084, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 502.734375, "epoch": 0.0746679512765811, "grad_norm": 0.9271642866990695, "kl": 0.6796875, "learning_rate": 9.86306456293296e-07, "loss": -0.0, "reward": 2.207812547683716, "reward_std": 0.1835799515247345, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1085, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 512.59375, "epoch": 0.07473676966485444, "grad_norm": 3.860714971482713, "kl": 0.69140625, "learning_rate": 9.862813192242794e-07, "loss": 0.0, "reward": 2.0361948013305664, "reward_std": 0.1361880600452423, "rewards/accuracy_reward": 0.8518197536468506, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1086, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 487.171875, "epoch": 0.0748055880531278, "grad_norm": 0.9334607902361824, "kl": 0.7265625, "learning_rate": 9.862561594254256e-07, "loss": 0.0, "reward": 2.2897677421569824, "reward_std": 0.14998523890972137, "rewards/accuracy_reward": 0.7116427421569824, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.421875, "step": 1087, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 517.890625, "epoch": 0.07487440644140114, "grad_norm": 0.35453111587464664, "kl": 0.68359375, "learning_rate": 9.862309768979105e-07, "loss": -0.0, "reward": 1.9781250953674316, "reward_std": 0.10643366724252701, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1088, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 511.890625, "epoch": 0.07494322482967448, "grad_norm": 1.0628344730438246, "kl": 0.6953125, "learning_rate": 9.862057716429114e-07, "loss": 0.0, "reward": 2.2526183128356934, "reward_std": 0.08031138777732849, "rewards/accuracy_reward": 0.6026184558868408, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1089, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 509.1875, "epoch": 0.07501204321794784, "grad_norm": 0.8281719936395361, "kl": 0.71875, "learning_rate": 9.861805436616064e-07, "loss": 0.0, "reward": 1.9553353786468506, "reward_std": 0.024694815278053284, "rewards/accuracy_reward": 0.7178353071212769, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0625, "step": 1090, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 532.5, "epoch": 0.07508086160622118, "grad_norm": 0.40930600009903656, "kl": 0.65234375, "learning_rate": 9.86155292955175e-07, "loss": -0.0, "reward": 2.5656251907348633, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1091, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 513.5, "epoch": 0.07514967999449453, "grad_norm": 0.0, "kl": 0.6796875, "learning_rate": 9.861300195247968e-07, "loss": 0.0, "reward": 2.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1092, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 523.828125, "epoch": 0.07521849838276788, "grad_norm": 0.5777879691752722, "kl": 0.69921875, "learning_rate": 9.861047233716535e-07, "loss": 0.0, "reward": 2.1624999046325684, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1093, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 539.375, "epoch": 0.07528731677104122, "grad_norm": 0.0, "kl": 0.65625, "learning_rate": 9.860794044969276e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1094, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 522.828125, "epoch": 0.07535613515931457, "grad_norm": 3.521351144644639, "kl": 0.6953125, "learning_rate": 9.860540629018025e-07, "loss": 0.0, "reward": 2.4565916061401367, "reward_std": 0.09476786851882935, "rewards/accuracy_reward": 0.7909666299819946, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1095, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 531.171875, "epoch": 0.07542495354758792, "grad_norm": 0.4024790280297841, "kl": 0.6875, "learning_rate": 9.860286985874628e-07, "loss": 0.0, "reward": 2.192187547683716, "reward_std": 0.11648450791835785, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4765625, "step": 1096, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 549.453125, "epoch": 0.07549377193586126, "grad_norm": 1.012017804274651, "kl": 0.6953125, "learning_rate": 9.86003311555094e-07, "loss": 0.0, "reward": 1.9577903747558594, "reward_std": 0.32023945450782776, "rewards/accuracy_reward": 0.7640403509140015, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0625, "step": 1097, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 551.03125, "epoch": 0.07556259032413461, "grad_norm": 0.6908341741006723, "kl": 0.66015625, "learning_rate": 9.859779018058826e-07, "loss": 0.0, "reward": 1.975000023841858, "reward_std": 0.08017837256193161, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1098, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 547.1875, "epoch": 0.07563140871240795, "grad_norm": 0.4081578081440415, "kl": 0.6796875, "learning_rate": 9.859524693410163e-07, "loss": 0.0, "reward": 1.8186044692993164, "reward_std": 0.004002016503363848, "rewards/accuracy_reward": 0.6686043739318848, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1099, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 534.84375, "epoch": 0.0757002271006813, "grad_norm": 0.7229438070290292, "kl": 0.73046875, "learning_rate": 9.85927014161684e-07, "loss": 0.0, "reward": 2.496875047683716, "reward_std": 0.09836839139461517, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.484375, "step": 1100, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 574.390625, "epoch": 0.07576904548895465, "grad_norm": 1.3890372491020309, "kl": 0.64453125, "learning_rate": 9.859015362690757e-07, "loss": 0.0, "reward": 1.8301985263824463, "reward_std": 0.2572598457336426, "rewards/accuracy_reward": 0.7645735144615173, "rewards/format_reward": 0.90625, "rewards/transform_reward": 0.0, "step": 1101, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 547.84375, "epoch": 0.07583786387722799, "grad_norm": 0.0, "kl": 0.63671875, "learning_rate": 9.85876035664382e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1102, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 577.0, "epoch": 0.07590668226550135, "grad_norm": 2.429495613906207, "kl": 0.6171875, "learning_rate": 9.85850512348795e-07, "loss": -0.0, "reward": 1.5931763648986816, "reward_std": 0.3007732629776001, "rewards/accuracy_reward": 0.5150513052940369, "rewards/format_reward": 0.9375, "rewards/transform_reward": 0.0, "step": 1103, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 548.328125, "epoch": 0.07597550065377469, "grad_norm": 0.39670808423868603, "kl": 0.65625, "learning_rate": 9.858249663235078e-07, "loss": 0.0, "reward": 1.9737269878387451, "reward_std": 0.008090194314718246, "rewards/accuracy_reward": 0.7987269759178162, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1104, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 529.71875, "epoch": 0.07604431904204803, "grad_norm": 1.0361175132596436, "kl": 0.69921875, "learning_rate": 9.857993975897143e-07, "loss": 0.0, "reward": 2.1468749046325684, "reward_std": 0.15026019513607025, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1105, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 519.671875, "epoch": 0.07611313743032139, "grad_norm": 3.664557525355728, "kl": 0.640625, "learning_rate": 9.857738061486096e-07, "loss": -0.0, "reward": 2.435513496398926, "reward_std": 0.24613146483898163, "rewards/accuracy_reward": 0.79020094871521, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1106, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 540.765625, "epoch": 0.07618195581859473, "grad_norm": 0.5093596389219889, "kl": 0.66796875, "learning_rate": 9.857481920013902e-07, "loss": 0.0, "reward": 2.106250047683716, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1107, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 523.703125, "epoch": 0.07625077420686807, "grad_norm": 3.2332062520814966, "kl": 0.62890625, "learning_rate": 9.85722555149253e-07, "loss": 0.0, "reward": 2.3390626907348633, "reward_std": 0.26233452558517456, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1108, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 536.390625, "epoch": 0.07631959259514143, "grad_norm": 0.7190025755342319, "kl": 0.65234375, "learning_rate": 9.856968955933963e-07, "loss": -0.0, "reward": 1.7793605327606201, "reward_std": 0.08287984132766724, "rewards/accuracy_reward": 0.6387354135513306, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1109, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 524.46875, "epoch": 0.07638841098341477, "grad_norm": 0.4722577583752255, "kl": 0.6953125, "learning_rate": 9.8567121333502e-07, "loss": -0.0, "reward": 2.4937500953674316, "reward_std": 0.07763238251209259, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1110, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 537.03125, "epoch": 0.07645722937168811, "grad_norm": 1.4286301572283826, "kl": 0.640625, "learning_rate": 9.856455083753242e-07, "loss": 0.0, "reward": 1.8644965887069702, "reward_std": 0.14085182547569275, "rewards/accuracy_reward": 0.7113715410232544, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1111, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 532.578125, "epoch": 0.07652604775996147, "grad_norm": 0.0, "kl": 0.6484375, "learning_rate": 9.8561978071551e-07, "loss": 0.0, "reward": 1.7916667461395264, "reward_std": 0.0, "rewards/accuracy_reward": 0.6416666507720947, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1112, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 497.5625, "epoch": 0.07659486614823481, "grad_norm": 0.393759784724365, "kl": 0.79296875, "learning_rate": 9.855940303567807e-07, "loss": 0.0, "reward": 2.4471466541290283, "reward_std": 0.018282894045114517, "rewards/accuracy_reward": 0.8346465826034546, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1113, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 547.671875, "epoch": 0.07666368453650815, "grad_norm": 0.0, "kl": 0.66015625, "learning_rate": 9.855682573003398e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1114, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 500.765625, "epoch": 0.0767325029247815, "grad_norm": 0.42324191065444244, "kl": 0.7265625, "learning_rate": 9.855424615473913e-07, "loss": -0.0, "reward": 2.5765976905822754, "reward_std": 0.009305906482040882, "rewards/accuracy_reward": 0.9390975832939148, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1115, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 540.359375, "epoch": 0.07680132131305485, "grad_norm": 1.4720155329366482, "kl": 0.62890625, "learning_rate": 9.855166430991417e-07, "loss": -0.0, "reward": 2.1047637462615967, "reward_std": 0.1143331378698349, "rewards/accuracy_reward": 0.48288869857788086, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1116, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 521.984375, "epoch": 0.0768701397013282, "grad_norm": 0.0, "kl": 0.6171875, "learning_rate": 9.854908019567975e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1117, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 514.84375, "epoch": 0.07693895808960154, "grad_norm": 0.9364305833903737, "kl": 0.66796875, "learning_rate": 9.854649381215668e-07, "loss": 0.0, "reward": 2.4156250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1118, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 577.8125, "epoch": 0.07700777647787489, "grad_norm": 0.8113368251133966, "kl": 0.671875, "learning_rate": 9.854390515946582e-07, "loss": 0.0, "reward": 1.9178755283355713, "reward_std": 0.1614839732646942, "rewards/accuracy_reward": 0.6616255640983582, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.125, "step": 1119, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 553.40625, "epoch": 0.07707659486614823, "grad_norm": 0.7076261363996473, "kl": 0.64453125, "learning_rate": 9.854131423772818e-07, "loss": 0.0, "reward": 1.8888269662857056, "reward_std": 0.08098755031824112, "rewards/accuracy_reward": 0.732576847076416, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1120, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 554.609375, "epoch": 0.07714541325442158, "grad_norm": 0.0, "kl": 0.703125, "learning_rate": 9.853872104706487e-07, "loss": 0.0, "reward": 2.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1121, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 538.171875, "epoch": 0.07721423164269493, "grad_norm": 0.5799020766339631, "kl": 0.7265625, "learning_rate": 9.853612558759713e-07, "loss": 0.0, "reward": 2.03125, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1122, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 554.53125, "epoch": 0.07728305003096828, "grad_norm": 0.9916693499107512, "kl": 0.6328125, "learning_rate": 9.85335278594462e-07, "loss": 0.0, "reward": 2.543659210205078, "reward_std": 0.14836722612380981, "rewards/accuracy_reward": 0.8717842102050781, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 1123, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 518.53125, "epoch": 0.07735186841924162, "grad_norm": 0.67428360155191, "kl": 0.6796875, "learning_rate": 9.853092786273358e-07, "loss": -0.0, "reward": 2.512500047683716, "reward_std": 0.15526476502418518, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1124, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 573.515625, "epoch": 0.07742068680751497, "grad_norm": 0.8505819341864664, "kl": 0.67578125, "learning_rate": 9.852832559758078e-07, "loss": 0.0, "reward": 1.8164867162704468, "reward_std": 0.11536058783531189, "rewards/accuracy_reward": 0.6852365732192993, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1125, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 570.890625, "epoch": 0.07748950519578832, "grad_norm": 1.2087461497789826, "kl": 0.69140625, "learning_rate": 9.852572106410942e-07, "loss": -0.0, "reward": 1.6156460046768188, "reward_std": 0.2715167999267578, "rewards/accuracy_reward": 0.5468959808349609, "rewards/format_reward": 0.9375, "rewards/transform_reward": 0.0, "step": 1126, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 534.03125, "epoch": 0.07755832358406166, "grad_norm": 0.6299289621964643, "kl": 0.64453125, "learning_rate": 9.852311426244124e-07, "loss": 0.0, "reward": 1.8611981868743896, "reward_std": 0.0731101781129837, "rewards/accuracy_reward": 0.7049481868743896, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1127, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 562.3125, "epoch": 0.07762714197233501, "grad_norm": 0.6888978962212465, "kl": 0.609375, "learning_rate": 9.85205051926981e-07, "loss": 0.0, "reward": 2.005484104156494, "reward_std": 0.20380638539791107, "rewards/accuracy_reward": 0.8679841160774231, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.0, "step": 1128, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 577.4375, "epoch": 0.07769596036060836, "grad_norm": 2.3974951731268006, "kl": 0.66796875, "learning_rate": 9.851789385500195e-07, "loss": 0.0, "reward": 2.5421876907348633, "reward_std": 0.022097086533904076, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 1129, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 588.03125, "epoch": 0.0777647787488817, "grad_norm": 0.0, "kl": 0.65625, "learning_rate": 9.851528024947483e-07, "loss": 0.0, "reward": 2.1125001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0625, "step": 1130, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 600.453125, "epoch": 0.07783359713715504, "grad_norm": 2.2347733179072455, "kl": 0.58203125, "learning_rate": 9.851266437623895e-07, "loss": 0.0, "reward": 2.2835307121276855, "reward_std": 0.19905945658683777, "rewards/accuracy_reward": 0.6725931763648987, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.4921875, "step": 1131, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 578.515625, "epoch": 0.0779024155254284, "grad_norm": 0.6494161362714207, "kl": 0.671875, "learning_rate": 9.851004623541655e-07, "loss": -0.0, "reward": 2.2543678283691406, "reward_std": 0.09653972089290619, "rewards/accuracy_reward": 0.6168677806854248, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1132, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 572.40625, "epoch": 0.07797123391370174, "grad_norm": 0.6636048396076485, "kl": 0.62109375, "learning_rate": 9.850742582713001e-07, "loss": 0.0, "reward": 1.7852774858474731, "reward_std": 0.11210712045431137, "rewards/accuracy_reward": 0.6540274620056152, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1133, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 555.96875, "epoch": 0.07804005230197508, "grad_norm": 0.5244066444718959, "kl": 0.625, "learning_rate": 9.850480315150182e-07, "loss": 0.0, "reward": 1.9156250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1134, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 581.984375, "epoch": 0.07810887069024844, "grad_norm": 1.4461784520436836, "kl": 0.578125, "learning_rate": 9.850217820865457e-07, "loss": -0.0, "reward": 2.273585557937622, "reward_std": 0.16777965426445007, "rewards/accuracy_reward": 0.65483558177948, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.5, "step": 1135, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 569.640625, "epoch": 0.07817768907852178, "grad_norm": 0.7820385171118378, "kl": 0.640625, "learning_rate": 9.849955099871095e-07, "loss": 0.0, "reward": 1.9071946144104004, "reward_std": 0.21794912219047546, "rewards/accuracy_reward": 0.7650070190429688, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.0234375, "step": 1136, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 552.9375, "epoch": 0.07824650746679512, "grad_norm": 1.0648929635810858, "kl": 0.62890625, "learning_rate": 9.849692152179377e-07, "loss": 0.0, "reward": 2.052428960800171, "reward_std": 0.19017745554447174, "rewards/accuracy_reward": 0.8836790323257446, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1137, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 536.390625, "epoch": 0.07831532585506848, "grad_norm": 1.6534661925134313, "kl": 0.64453125, "learning_rate": 9.849428977802592e-07, "loss": 0.0, "reward": 2.3968749046325684, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1138, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 553.8125, "epoch": 0.07838414424334182, "grad_norm": 0.6225428952711424, "kl": 0.640625, "learning_rate": 9.849165576753043e-07, "loss": 0.0, "reward": 1.975000023841858, "reward_std": 0.1306653916835785, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1139, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 534.140625, "epoch": 0.07845296263161516, "grad_norm": 0.4696745412476235, "kl": 0.66796875, "learning_rate": 9.848901949043041e-07, "loss": 0.0, "reward": 1.8305712938308716, "reward_std": 0.00974156241863966, "rewards/accuracy_reward": 0.6805711984634399, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1140, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 543.4375, "epoch": 0.07852178101988852, "grad_norm": 0.0, "kl": 0.64453125, "learning_rate": 9.84863809468491e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1141, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 542.859375, "epoch": 0.07859059940816186, "grad_norm": 0.9688789029874049, "kl": 0.63671875, "learning_rate": 9.84837401369098e-07, "loss": -0.0, "reward": 1.7840540409088135, "reward_std": 0.17755641043186188, "rewards/accuracy_reward": 0.6465539932250977, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 1142, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 518.78125, "epoch": 0.0786594177964352, "grad_norm": 1.4016359521126487, "kl": 0.6484375, "learning_rate": 9.848109706073597e-07, "loss": -0.0, "reward": 1.7992337942123413, "reward_std": 0.14753681421279907, "rewards/accuracy_reward": 0.6617338061332703, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1143, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 534.40625, "epoch": 0.07872823618470856, "grad_norm": 1.325439398017849, "kl": 0.63671875, "learning_rate": 9.847845171845117e-07, "loss": -0.0, "reward": 1.8632032871246338, "reward_std": 0.1955110728740692, "rewards/accuracy_reward": 0.706953227519989, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1144, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 525.515625, "epoch": 0.0787970545729819, "grad_norm": 0.9472790526454636, "kl": 0.6875, "learning_rate": 9.8475804110179e-07, "loss": 0.0, "reward": 2.190624952316284, "reward_std": 0.25725260376930237, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4375, "step": 1145, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 492.21875, "epoch": 0.07886587296125525, "grad_norm": 0.5267807563909097, "kl": 0.69921875, "learning_rate": 9.847315423604328e-07, "loss": 0.0, "reward": 1.8625000715255737, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1146, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 509.765625, "epoch": 0.07893469134952859, "grad_norm": 1.39393101222445, "kl": 0.69921875, "learning_rate": 9.84705020961678e-07, "loss": -0.0, "reward": 2.3296022415161133, "reward_std": 0.06968475878238678, "rewards/accuracy_reward": 0.6733521819114685, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 1147, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 517.96875, "epoch": 0.07900350973780194, "grad_norm": 0.7657987559194797, "kl": 0.66015625, "learning_rate": 9.846784769067658e-07, "loss": 0.0, "reward": 2.3306283950805664, "reward_std": 0.03547457605600357, "rewards/accuracy_reward": 0.7025034427642822, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.453125, "step": 1148, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 500.796875, "epoch": 0.07907232812607529, "grad_norm": 1.5511275766725197, "kl": 0.66796875, "learning_rate": 9.846519101969368e-07, "loss": 0.0, "reward": 2.362499952316284, "reward_std": 0.1060660257935524, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1149, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 488.5625, "epoch": 0.07914114651434863, "grad_norm": 0.8530416973660929, "kl": 0.70703125, "learning_rate": 9.846253208334328e-07, "loss": 0.0, "reward": 2.090625047683716, "reward_std": 0.17740556597709656, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1150, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 483.0625, "epoch": 0.07920996490262198, "grad_norm": 0.5446669064008307, "kl": 0.66015625, "learning_rate": 9.845987088174963e-07, "loss": 0.0, "reward": 2.18412709236145, "reward_std": 0.028129801154136658, "rewards/accuracy_reward": 0.5591270923614502, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1151, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 505.390625, "epoch": 0.07927878329089533, "grad_norm": 0.0, "kl": 0.69140625, "learning_rate": 9.845720741503716e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1152, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 509.21875, "epoch": 0.07934760167916867, "grad_norm": 0.0, "kl": 0.66796875, "learning_rate": 9.845454168333037e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1153, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 483.78125, "epoch": 0.07941642006744203, "grad_norm": 0.5126825286872336, "kl": 0.72265625, "learning_rate": 9.845187368675382e-07, "loss": 0.0, "reward": 2.03125, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1154, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 478.78125, "epoch": 0.07948523845571537, "grad_norm": 1.0078884423891117, "kl": 0.6953125, "learning_rate": 9.844920342543227e-07, "loss": -0.0, "reward": 2.362499952316284, "reward_std": 0.2354431450366974, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1155, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 487.3125, "epoch": 0.07955405684398871, "grad_norm": 0.0, "kl": 0.703125, "learning_rate": 9.844653089949048e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1156, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 517.84375, "epoch": 0.07962287523226207, "grad_norm": 0.3738963148160083, "kl": 0.67578125, "learning_rate": 9.844385610905343e-07, "loss": 0.0, "reward": 2.2830448150634766, "reward_std": 0.06395433843135834, "rewards/accuracy_reward": 0.6517947912216187, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 1157, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 448.34375, "epoch": 0.07969169362053541, "grad_norm": 0.6090186614867028, "kl": 0.78515625, "learning_rate": 9.84411790542461e-07, "loss": 0.0, "reward": 2.512500047683716, "reward_std": 0.06943651288747787, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1158, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 451.65625, "epoch": 0.07976051200880875, "grad_norm": 0.0, "kl": 0.76953125, "learning_rate": 9.843849973519362e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1159, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 508.34375, "epoch": 0.07982933039708211, "grad_norm": 0.4981277063264785, "kl": 0.65625, "learning_rate": 9.843581815202124e-07, "loss": 0.0, "reward": 2.3283658027648926, "reward_std": 0.006235738750547171, "rewards/accuracy_reward": 0.6783655881881714, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1160, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 513.609375, "epoch": 0.07989814878535545, "grad_norm": 0.7018158760514474, "kl": 0.671875, "learning_rate": 9.843313430485432e-07, "loss": 0.0, "reward": 2.423194408416748, "reward_std": 0.07197118550539017, "rewards/accuracy_reward": 0.7544442415237427, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1161, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 464.109375, "epoch": 0.07996696717362879, "grad_norm": 0.8391825453741716, "kl": 0.76953125, "learning_rate": 9.84304481938183e-07, "loss": -0.0, "reward": 2.078125, "reward_std": 0.11363068222999573, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.125, "step": 1162, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 487.953125, "epoch": 0.08003578556190213, "grad_norm": 2.106562928501752, "kl": 0.67578125, "learning_rate": 9.84277598190387e-07, "loss": -0.0, "reward": 2.510652780532837, "reward_std": 0.14260552823543549, "rewards/accuracy_reward": 0.8294028639793396, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1163, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 477.21875, "epoch": 0.08010460395017549, "grad_norm": 0.0, "kl": 0.6796875, "learning_rate": 9.842506918064122e-07, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1164, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 489.6875, "epoch": 0.08017342233844883, "grad_norm": 2.0866978246489407, "kl": 0.71484375, "learning_rate": 9.842237627875162e-07, "loss": -0.0, "reward": 2.345754623413086, "reward_std": 0.03416513279080391, "rewards/accuracy_reward": 0.7332543134689331, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1165, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 496.5, "epoch": 0.08024224072672217, "grad_norm": 0.438763966814846, "kl": 0.66796875, "learning_rate": 9.841968111349578e-07, "loss": -0.0, "reward": 1.6722283363342285, "reward_std": 0.0014136452227830887, "rewards/accuracy_reward": 0.547228217124939, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1166, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 482.765625, "epoch": 0.08031105911499553, "grad_norm": 0.7612097628419774, "kl": 0.70703125, "learning_rate": 9.841698368499964e-07, "loss": 0.0, "reward": 1.8081738948822021, "reward_std": 0.010041230358183384, "rewards/accuracy_reward": 0.6581738591194153, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1167, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 450.15625, "epoch": 0.08037987750326887, "grad_norm": 1.095047267337603, "kl": 0.80078125, "learning_rate": 9.84142839933893e-07, "loss": 0.0, "reward": 2.218193531036377, "reward_std": 0.09472055733203888, "rewards/accuracy_reward": 0.5775684118270874, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1168, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 503.703125, "epoch": 0.08044869589154222, "grad_norm": 0.4383540400763314, "kl": 0.70703125, "learning_rate": 9.841158203879098e-07, "loss": -0.0, "reward": 2.3171706199645996, "reward_std": 0.0018183407373726368, "rewards/accuracy_reward": 0.6671704053878784, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1169, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 489.265625, "epoch": 0.08051751427981557, "grad_norm": 0.38556088066246297, "kl": 0.734375, "learning_rate": 9.840887782133096e-07, "loss": -0.0, "reward": 2.1666934490203857, "reward_std": 0.026019180193543434, "rewards/accuracy_reward": 0.8041932582855225, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1875, "step": 1170, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 514.09375, "epoch": 0.08058633266808891, "grad_norm": 0.0, "kl": 0.671875, "learning_rate": 9.840617134113562e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1171, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 533.359375, "epoch": 0.08065515105636226, "grad_norm": 0.428538695551311, "kl": 0.69140625, "learning_rate": 9.840346259833148e-07, "loss": -0.0, "reward": 2.4937500953674316, "reward_std": 0.07763238251209259, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1172, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 526.453125, "epoch": 0.08072396944463561, "grad_norm": 0.0, "kl": 0.68359375, "learning_rate": 9.840075159304516e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1173, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 539.984375, "epoch": 0.08079278783290895, "grad_norm": 0.8074355608992038, "kl": 0.65625, "learning_rate": 9.839803832540336e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.13887301087379456, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1174, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 546.359375, "epoch": 0.0808616062211823, "grad_norm": 0.0, "kl": 0.68359375, "learning_rate": 9.839532279553292e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1175, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 551.140625, "epoch": 0.08093042460945565, "grad_norm": 0.7200595170319758, "kl": 0.69140625, "learning_rate": 9.839260500356076e-07, "loss": 0.0, "reward": 2.5234375, "reward_std": 0.16351844370365143, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1176, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 592.3125, "epoch": 0.080999242997729, "grad_norm": 0.7194298373510833, "kl": 0.6015625, "learning_rate": 9.838988494961391e-07, "loss": -0.0, "reward": 1.938644289970398, "reward_std": 0.2636149823665619, "rewards/accuracy_reward": 0.7948943376541138, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.0, "step": 1177, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 561.796875, "epoch": 0.08106806138600234, "grad_norm": 0.8231158854922321, "kl": 0.6484375, "learning_rate": 9.838716263381955e-07, "loss": -0.0, "reward": 2.3462536334991455, "reward_std": 0.2528087794780731, "rewards/accuracy_reward": 0.7196912169456482, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1178, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 561.625, "epoch": 0.08113687977427568, "grad_norm": 0.6574408423288586, "kl": 0.68359375, "learning_rate": 9.838443805630487e-07, "loss": -0.0, "reward": 1.712499976158142, "reward_std": 0.14961488544940948, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1179, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 590.6875, "epoch": 0.08120569816254904, "grad_norm": 0.4212051100664526, "kl": 0.59765625, "learning_rate": 9.838171121719727e-07, "loss": 0.0, "reward": 2.6234374046325684, "reward_std": 0.14274689555168152, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.4921875, "step": 1180, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 571.140625, "epoch": 0.08127451655082238, "grad_norm": 1.460899723789344, "kl": 0.7109375, "learning_rate": 9.83789821166242e-07, "loss": -0.0, "reward": 1.9122685194015503, "reward_std": 0.06409884989261627, "rewards/accuracy_reward": 0.6607059836387634, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1015625, "step": 1181, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 594.0625, "epoch": 0.08134333493909572, "grad_norm": 0.8686494306833642, "kl": 0.640625, "learning_rate": 9.837625075471317e-07, "loss": 0.0, "reward": 2.5546875, "reward_std": 0.41100579500198364, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.4765625, "step": 1182, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 604.890625, "epoch": 0.08141215332736908, "grad_norm": 0.7546741068449904, "kl": 0.625, "learning_rate": 9.837351713159191e-07, "loss": -0.0, "reward": 2.3712313175201416, "reward_std": 0.14941248297691345, "rewards/accuracy_reward": 0.7274812459945679, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 1183, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 571.890625, "epoch": 0.08148097171564242, "grad_norm": 0.5758292458515069, "kl": 0.6328125, "learning_rate": 9.83707812473882e-07, "loss": 0.0, "reward": 2.489062547683716, "reward_std": 0.2607456147670746, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.4921875, "step": 1184, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 635.515625, "epoch": 0.08154979010391576, "grad_norm": 0.7354564238744032, "kl": 0.59765625, "learning_rate": 9.836804310222988e-07, "loss": 0.0, "reward": 1.9803681373596191, "reward_std": 0.45941245555877686, "rewards/accuracy_reward": 0.5225555896759033, "rewards/format_reward": 0.859375, "rewards/transform_reward": 0.4765625, "step": 1185, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 566.484375, "epoch": 0.08161860849218912, "grad_norm": 0.6479849032961297, "kl": 0.640625, "learning_rate": 9.836530269624497e-07, "loss": 0.0, "reward": 2.512500047683716, "reward_std": 0.06943651288747787, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1186, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 572.28125, "epoch": 0.08168742688046246, "grad_norm": 0.7256962644910095, "kl": 0.65625, "learning_rate": 9.836256002956155e-07, "loss": 0.0, "reward": 2.0374999046325684, "reward_std": 0.20493263006210327, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 1187, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 556.015625, "epoch": 0.0817562452687358, "grad_norm": 0.8740503991418008, "kl": 0.6484375, "learning_rate": 9.83598151023078e-07, "loss": 0.0, "reward": 2.4890623092651367, "reward_std": 0.17235727608203888, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1188, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 548.140625, "epoch": 0.08182506365700916, "grad_norm": 0.6779180527696327, "kl": 0.6640625, "learning_rate": 9.835706791461204e-07, "loss": -0.0, "reward": 2.31276273727417, "reward_std": 0.1760682910680771, "rewards/accuracy_reward": 0.6893253326416016, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1189, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 539.21875, "epoch": 0.0818938820452825, "grad_norm": 0.4122259994023471, "kl": 0.6640625, "learning_rate": 9.83543184666027e-07, "loss": 0.0, "reward": 2.1548476219177246, "reward_std": 0.009336205199360847, "rewards/accuracy_reward": 0.9548474550247192, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1190, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 545.609375, "epoch": 0.08196270043355584, "grad_norm": 0.5526929090209567, "kl": 0.671875, "learning_rate": 9.835156675840825e-07, "loss": -0.0, "reward": 1.8082338571548462, "reward_std": 0.05776429548859596, "rewards/accuracy_reward": 0.6738588213920593, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1191, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 523.578125, "epoch": 0.0820315188218292, "grad_norm": 0.6154108919149085, "kl": 0.64453125, "learning_rate": 9.834881279015736e-07, "loss": 0.0, "reward": 2.053895950317383, "reward_std": 0.0861605629324913, "rewards/accuracy_reward": 0.869520902633667, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1192, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 493.078125, "epoch": 0.08210033721010254, "grad_norm": 0.7212399949207514, "kl": 0.76953125, "learning_rate": 9.834605656197871e-07, "loss": 0.0, "reward": 2.3125, "reward_std": 0.15587717294692993, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.46875, "step": 1193, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 523.875, "epoch": 0.08216915559837588, "grad_norm": 0.0, "kl": 0.68359375, "learning_rate": 9.834329807400118e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1194, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 514.640625, "epoch": 0.08223797398664923, "grad_norm": 0.0, "kl": 0.7109375, "learning_rate": 9.834053732635366e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1195, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 515.484375, "epoch": 0.08230679237492258, "grad_norm": 0.5797733482407601, "kl": 0.640625, "learning_rate": 9.833777431916522e-07, "loss": -0.0, "reward": 1.7818560600280762, "reward_std": 0.081697478890419, "rewards/accuracy_reward": 0.6412309408187866, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1196, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 503.046875, "epoch": 0.08237561076319593, "grad_norm": 1.0253566640306435, "kl": 0.65234375, "learning_rate": 9.833500905256502e-07, "loss": 0.0, "reward": 2.03125, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1197, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 498.140625, "epoch": 0.08244442915146927, "grad_norm": 0.0, "kl": 0.671875, "learning_rate": 9.833224152668229e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1198, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 506.625, "epoch": 0.08251324753974262, "grad_norm": 1.0945589675823364, "kl": 0.6953125, "learning_rate": 9.832947174164638e-07, "loss": -0.0, "reward": 2.036288261413574, "reward_std": 0.0814225971698761, "rewards/accuracy_reward": 0.8519130349159241, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1199, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 494.6875, "epoch": 0.08258206592801597, "grad_norm": 0.7612263629687398, "kl": 0.75390625, "learning_rate": 9.83266996975868e-07, "loss": 0.0, "reward": 2.0321168899536133, "reward_std": 0.07955861836671829, "rewards/accuracy_reward": 0.8508667945861816, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1200, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 488.046875, "epoch": 0.08265088431628931, "grad_norm": 0.0, "kl": 0.7578125, "learning_rate": 9.83239253946331e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1201, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 503.859375, "epoch": 0.08271970270456266, "grad_norm": 0.5921023631883993, "kl": 0.74609375, "learning_rate": 9.832114883291493e-07, "loss": -0.0, "reward": 2.03125, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1202, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 480.40625, "epoch": 0.082788521092836, "grad_norm": 2.2327854027122136, "kl": 0.671875, "learning_rate": 9.83183700125621e-07, "loss": -0.0, "reward": 1.9506679773330688, "reward_std": 0.060039419680833817, "rewards/accuracy_reward": 0.7787929773330688, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1203, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 498.09375, "epoch": 0.08285733948110935, "grad_norm": 0.49690836491573703, "kl": 0.6640625, "learning_rate": 9.831558893370451e-07, "loss": 0.0, "reward": 1.9437501430511475, "reward_std": 0.20366084575653076, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 1204, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 519.796875, "epoch": 0.0829261578693827, "grad_norm": 1.1939645021379959, "kl": 0.6953125, "learning_rate": 9.831280559647212e-07, "loss": 0.0, "reward": 2.0152957439422607, "reward_std": 0.154160737991333, "rewards/accuracy_reward": 0.8340456485748291, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1205, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 503.640625, "epoch": 0.08299497625765605, "grad_norm": 0.0, "kl": 0.64453125, "learning_rate": 9.831002000099506e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1206, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 509.328125, "epoch": 0.08306379464592939, "grad_norm": 0.9201962789922706, "kl": 0.7265625, "learning_rate": 9.83072321474035e-07, "loss": -0.0, "reward": 1.9903569221496582, "reward_std": 0.046569909900426865, "rewards/accuracy_reward": 0.8153569102287292, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1207, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 464.15625, "epoch": 0.08313261303420275, "grad_norm": 5.1315413463600725, "kl": 0.6953125, "learning_rate": 9.83044420358278e-07, "loss": -0.0, "reward": 1.9375, "reward_std": 0.22470125555992126, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1208, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 476.890625, "epoch": 0.08320143142247609, "grad_norm": 1.7812771946241022, "kl": 0.71875, "learning_rate": 9.830164966639833e-07, "loss": 0.0, "reward": 2.5968596935272217, "reward_std": 0.05623073875904083, "rewards/accuracy_reward": 0.8999847173690796, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1209, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 463.1875, "epoch": 0.08327024981074943, "grad_norm": 0.0, "kl": 0.6953125, "learning_rate": 9.829885503924563e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1210, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 472.21875, "epoch": 0.08333906819902277, "grad_norm": 0.7988522375867402, "kl": 0.76953125, "learning_rate": 9.829605815450032e-07, "loss": 0.0, "reward": 2.213362216949463, "reward_std": 0.07309886813163757, "rewards/accuracy_reward": 0.582112193107605, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1211, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 460.171875, "epoch": 0.08340788658729613, "grad_norm": 0.0, "kl": 0.69140625, "learning_rate": 9.829325901229316e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1212, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 468.75, "epoch": 0.08347670497556947, "grad_norm": 0.43426338528637654, "kl": 0.765625, "learning_rate": 9.829045761275494e-07, "loss": 0.0, "reward": 2.3279218673706055, "reward_std": 0.0038473664317280054, "rewards/accuracy_reward": 0.6779218912124634, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1213, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 477.109375, "epoch": 0.08354552336384281, "grad_norm": 0.7892451879249391, "kl": 0.7734375, "learning_rate": 9.828765395601665e-07, "loss": -0.0, "reward": 2.254687547683716, "reward_std": 0.10227546840906143, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4296875, "step": 1214, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 479.453125, "epoch": 0.08361434175211617, "grad_norm": 1.0494757648726367, "kl": 0.66015625, "learning_rate": 9.82848480422093e-07, "loss": 0.0, "reward": 2.4739022254943848, "reward_std": 0.0030178020242601633, "rewards/accuracy_reward": 0.798902153968811, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1215, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 484.90625, "epoch": 0.08368316014038951, "grad_norm": 0.7755444959171452, "kl": 0.65625, "learning_rate": 9.828203987146407e-07, "loss": 0.0, "reward": 2.4622044563293457, "reward_std": 0.008090460672974586, "rewards/accuracy_reward": 0.7622044086456299, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1216, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 465.984375, "epoch": 0.08375197852866285, "grad_norm": 0.7281721302654393, "kl": 0.76953125, "learning_rate": 9.827922944391221e-07, "loss": 0.0, "reward": 2.527557849884033, "reward_std": 0.004810805432498455, "rewards/accuracy_reward": 0.8275576829910278, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1217, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 465.046875, "epoch": 0.08382079691693621, "grad_norm": 2.2845065657981722, "kl": 0.8046875, "learning_rate": 9.82764167596851e-07, "loss": -0.0, "reward": 2.2437500953674316, "reward_std": 0.09810129553079605, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.34375, "step": 1218, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 453.34375, "epoch": 0.08388961530520955, "grad_norm": 1.6427893086572405, "kl": 0.7265625, "learning_rate": 9.82736018189142e-07, "loss": 0.0, "reward": 2.402273178100586, "reward_std": 0.018794002011418343, "rewards/accuracy_reward": 0.7335231900215149, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1219, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 433.328125, "epoch": 0.0839584336934829, "grad_norm": 1.8882752682368726, "kl": 0.7265625, "learning_rate": 9.827078462173105e-07, "loss": 0.0, "reward": 2.1379165649414062, "reward_std": 0.003328954102471471, "rewards/accuracy_reward": 0.93791663646698, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1220, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 459.0625, "epoch": 0.08402725208175625, "grad_norm": 0.0, "kl": 0.66015625, "learning_rate": 9.826796516826742e-07, "loss": 0.0, "reward": 2.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1221, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 445.578125, "epoch": 0.0840960704700296, "grad_norm": 1.0553104910728242, "kl": 0.75390625, "learning_rate": 9.8265143458655e-07, "loss": 0.0, "reward": 1.840309739112854, "reward_std": 0.15905751287937164, "rewards/accuracy_reward": 0.6934346556663513, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1222, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 443.609375, "epoch": 0.08416488885830294, "grad_norm": 1.8023106732561218, "kl": 0.7578125, "learning_rate": 9.826231949302575e-07, "loss": 0.0, "reward": 2.4406251907348633, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.390625, "step": 1223, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 423.03125, "epoch": 0.08423370724657629, "grad_norm": 0.0, "kl": 0.765625, "learning_rate": 9.825949327151164e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1224, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 426.4375, "epoch": 0.08430252563484963, "grad_norm": 0.9822407832102981, "kl": 0.77734375, "learning_rate": 9.825666479424478e-07, "loss": 0.0, "reward": 2.5062499046325684, "reward_std": 0.08711418509483337, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1225, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 442.078125, "epoch": 0.08437134402312298, "grad_norm": 0.9899623919852539, "kl": 0.6875, "learning_rate": 9.825383406135738e-07, "loss": -0.0, "reward": 2.51381254196167, "reward_std": 0.1866403967142105, "rewards/accuracy_reward": 0.8434999585151672, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1226, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 419.25, "epoch": 0.08444016241139632, "grad_norm": 2.959689066739821, "kl": 0.78125, "learning_rate": 9.825100107298174e-07, "loss": -0.0, "reward": 2.4375, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1227, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 436.265625, "epoch": 0.08450898079966968, "grad_norm": 0.7650517899084164, "kl": 0.80078125, "learning_rate": 9.82481658292503e-07, "loss": -0.0, "reward": 2.0875000953674316, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1228, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 412.765625, "epoch": 0.08457779918794302, "grad_norm": 1.2781411402657838, "kl": 0.73828125, "learning_rate": 9.82453283302956e-07, "loss": 0.0, "reward": 2.453125, "reward_std": 0.14205022156238556, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1229, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 407.21875, "epoch": 0.08464661757621636, "grad_norm": 0.568780546444329, "kl": 0.73828125, "learning_rate": 9.824248857625023e-07, "loss": -0.0, "reward": 1.9514610767364502, "reward_std": 0.008601678535342216, "rewards/accuracy_reward": 0.7795859575271606, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1230, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 424.078125, "epoch": 0.08471543596448972, "grad_norm": 1.063996148413549, "kl": 0.7109375, "learning_rate": 9.823964656724695e-07, "loss": 0.0, "reward": 2.4220714569091797, "reward_std": 0.0822649672627449, "rewards/accuracy_reward": 0.7564465403556824, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1231, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 401.203125, "epoch": 0.08478425435276306, "grad_norm": 1.0567117119144696, "kl": 0.69140625, "learning_rate": 9.823680230341859e-07, "loss": 0.0, "reward": 2.450000047683716, "reward_std": 0.12246951460838318, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1232, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 402.453125, "epoch": 0.0848530727410364, "grad_norm": 1.0852669749922284, "kl": 0.6328125, "learning_rate": 9.82339557848981e-07, "loss": 0.0, "reward": 1.7433501482009888, "reward_std": 0.03677576035261154, "rewards/accuracy_reward": 0.5964751243591309, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1233, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 440.90625, "epoch": 0.08492189112930976, "grad_norm": 1.2166711104213752, "kl": 0.6640625, "learning_rate": 9.823110701181855e-07, "loss": -0.0, "reward": 1.6732765436172485, "reward_std": 0.09248915314674377, "rewards/accuracy_reward": 0.5639015436172485, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1234, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 393.84375, "epoch": 0.0849907095175831, "grad_norm": 2.9101502192213435, "kl": 0.67578125, "learning_rate": 9.822825598431307e-07, "loss": 0.0, "reward": 2.1937499046325684, "reward_std": 0.017677675932645798, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1235, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 459.5625, "epoch": 0.08505952790585644, "grad_norm": 0.885043700515502, "kl": 0.68359375, "learning_rate": 9.822540270251496e-07, "loss": -0.0, "reward": 2.3356833457946777, "reward_std": 0.08259273320436478, "rewards/accuracy_reward": 0.7356831431388855, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1236, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 418.984375, "epoch": 0.0851283462941298, "grad_norm": 0.0, "kl": 0.6796875, "learning_rate": 9.822254716655755e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1237, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 412.40625, "epoch": 0.08519716468240314, "grad_norm": 1.421824757100901, "kl": 0.68359375, "learning_rate": 9.821968937657434e-07, "loss": -0.0, "reward": 2.606250047683716, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1238, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 452.859375, "epoch": 0.08526598307067648, "grad_norm": 1.0254682007212772, "kl": 0.72265625, "learning_rate": 9.821682933269888e-07, "loss": -0.0, "reward": 1.992197036743164, "reward_std": 0.12432616949081421, "rewards/accuracy_reward": 0.8140719532966614, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1239, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 424.453125, "epoch": 0.08533480145894984, "grad_norm": 0.4120352218844968, "kl": 0.63671875, "learning_rate": 9.821396703506488e-07, "loss": -0.0, "reward": 1.9687504768371582, "reward_std": 0.0039061589632183313, "rewards/accuracy_reward": 0.7937504053115845, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1240, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 419.4375, "epoch": 0.08540361984722318, "grad_norm": 0.0, "kl": 0.65234375, "learning_rate": 9.821110248380614e-07, "loss": 0.0, "reward": 2.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1241, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 428.84375, "epoch": 0.08547243823549652, "grad_norm": 3.3583024997942457, "kl": 0.8125, "learning_rate": 9.820823567905653e-07, "loss": -0.0, "reward": 2.3906829357147217, "reward_std": 0.13900141417980194, "rewards/accuracy_reward": 0.7453703880310059, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1242, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 452.953125, "epoch": 0.08554125662376987, "grad_norm": 0.7249226886571543, "kl": 0.66796875, "learning_rate": 9.820536662095004e-07, "loss": 0.0, "reward": 1.8199853897094727, "reward_std": 0.0484125092625618, "rewards/accuracy_reward": 0.6856104135513306, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1243, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 452.234375, "epoch": 0.08561007501204322, "grad_norm": 0.0, "kl": 0.6875, "learning_rate": 9.820249530962084e-07, "loss": 0.0, "reward": 2.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1244, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 459.0625, "epoch": 0.08567889340031656, "grad_norm": 0.945669977394186, "kl": 0.78125, "learning_rate": 9.819962174520306e-07, "loss": 0.0, "reward": 2.653125047683716, "reward_std": 0.05444391071796417, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.453125, "step": 1245, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 450.8125, "epoch": 0.0857477117885899, "grad_norm": 1.1107629821771507, "kl": 0.640625, "learning_rate": 9.819674592783108e-07, "loss": -0.0, "reward": 1.9093998670578003, "reward_std": 0.0350569486618042, "rewards/accuracy_reward": 0.7093998193740845, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1246, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 466.046875, "epoch": 0.08581653017686326, "grad_norm": 0.5306196406135116, "kl": 0.7421875, "learning_rate": 9.819386785763926e-07, "loss": -0.0, "reward": 2.58162784576416, "reward_std": 0.013279234990477562, "rewards/accuracy_reward": 0.8878779411315918, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1247, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 420.15625, "epoch": 0.0858853485651366, "grad_norm": 0.0, "kl": 0.65625, "learning_rate": 9.81909875347622e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1248, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 464.84375, "epoch": 0.08595416695340995, "grad_norm": 0.8471573638978259, "kl": 0.67578125, "learning_rate": 9.818810495933445e-07, "loss": -0.0, "reward": 2.512500047683716, "reward_std": 0.1496148705482483, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1249, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 502.0625, "epoch": 0.0860229853416833, "grad_norm": 1.1152682566347027, "kl": 0.6953125, "learning_rate": 9.818522013149082e-07, "loss": 0.0, "reward": 2.5421876907348633, "reward_std": 0.022097086533904076, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 1250, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 440.078125, "epoch": 0.08609180372995665, "grad_norm": 0.9353684507680723, "kl": 0.640625, "learning_rate": 9.818233305136613e-07, "loss": 0.0, "reward": 2.031250238418579, "reward_std": 0.22724726796150208, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1251, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 433.34375, "epoch": 0.08616062211822999, "grad_norm": 0.4227030783419966, "kl": 0.65625, "learning_rate": 9.817944371909533e-07, "loss": 0.0, "reward": 2.03125, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1252, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 530.359375, "epoch": 0.08622944050650334, "grad_norm": 0.9414238097253804, "kl": 0.609375, "learning_rate": 9.817655213481346e-07, "loss": -0.0, "reward": 2.060087203979492, "reward_std": 0.00808015838265419, "rewards/accuracy_reward": 0.8600870370864868, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1253, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 439.734375, "epoch": 0.08629825889477669, "grad_norm": 0.5242171091332931, "kl": 0.65625, "learning_rate": 9.817365829865568e-07, "loss": -0.0, "reward": 1.9522284269332886, "reward_std": 0.0014605375472456217, "rewards/accuracy_reward": 0.7772283554077148, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1254, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 479.609375, "epoch": 0.08636707728305003, "grad_norm": 0.0, "kl": 0.65234375, "learning_rate": 9.817076221075727e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1255, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 443.734375, "epoch": 0.08643589567132338, "grad_norm": 0.0, "kl": 0.62109375, "learning_rate": 9.816786387125358e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1256, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 447.34375, "epoch": 0.08650471405959673, "grad_norm": 0.0, "kl": 0.609375, "learning_rate": 9.816496328028012e-07, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1257, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 430.703125, "epoch": 0.08657353244787007, "grad_norm": 0.8257708431186414, "kl": 0.671875, "learning_rate": 9.81620604379724e-07, "loss": 0.0, "reward": 1.84375, "reward_std": 0.07763238251209259, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1258, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 492.03125, "epoch": 0.08664235083614341, "grad_norm": 0.0, "kl": 0.6953125, "learning_rate": 9.81591553444662e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1259, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 542.59375, "epoch": 0.08671116922441677, "grad_norm": 0.7648001808140503, "kl": 0.6328125, "learning_rate": 9.815624799989724e-07, "loss": -0.0, "reward": 2.624176502227783, "reward_std": 0.00535796582698822, "rewards/accuracy_reward": 0.9241766929626465, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1260, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 484.984375, "epoch": 0.08677998761269011, "grad_norm": 0.0, "kl": 0.6328125, "learning_rate": 9.815333840440144e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1261, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 486.96875, "epoch": 0.08684880600096345, "grad_norm": 0.39022248398260123, "kl": 0.6484375, "learning_rate": 9.815042655811478e-07, "loss": 0.0, "reward": 2.125316858291626, "reward_std": 0.003312629647552967, "rewards/accuracy_reward": 0.9253167510032654, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1262, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 433.796875, "epoch": 0.08691762438923681, "grad_norm": 0.0, "kl": 0.65625, "learning_rate": 9.814751246117338e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1263, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 424.046875, "epoch": 0.08698644277751015, "grad_norm": 0.0, "kl": 0.65234375, "learning_rate": 9.814459611371346e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1264, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 497.546875, "epoch": 0.08705526116578349, "grad_norm": 0.5952867152552478, "kl": 0.609375, "learning_rate": 9.814167751587134e-07, "loss": 0.0, "reward": 2.53125, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1265, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 524.59375, "epoch": 0.08712407955405685, "grad_norm": 0.9545574972240146, "kl": 0.58203125, "learning_rate": 9.813875666778343e-07, "loss": 0.0, "reward": 2.0452897548675537, "reward_std": 0.00627532321959734, "rewards/accuracy_reward": 0.8452896475791931, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1266, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 424.125, "epoch": 0.08719289794233019, "grad_norm": 0.0, "kl": 0.64453125, "learning_rate": 9.813583356958625e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1267, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 469.6875, "epoch": 0.08726171633060353, "grad_norm": 0.45577503340583553, "kl": 0.69921875, "learning_rate": 9.813290822141647e-07, "loss": 0.0, "reward": 2.1279640197753906, "reward_std": 0.004341209307312965, "rewards/accuracy_reward": 0.9279638528823853, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1268, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 466.515625, "epoch": 0.08733053471887689, "grad_norm": 0.0, "kl": 0.62890625, "learning_rate": 9.812998062341076e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1269, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 475.171875, "epoch": 0.08739935310715023, "grad_norm": 0.6715821420581318, "kl": 0.6171875, "learning_rate": 9.812705077570601e-07, "loss": 0.0, "reward": 2.03125, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1270, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 437.578125, "epoch": 0.08746817149542357, "grad_norm": 0.5217794269648012, "kl": 0.625, "learning_rate": 9.812411867843918e-07, "loss": 0.0, "reward": 2.1232924461364746, "reward_std": 0.004419825505465269, "rewards/accuracy_reward": 0.9232925176620483, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1271, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 477.15625, "epoch": 0.08753698988369693, "grad_norm": 0.8273781396118047, "kl": 0.71484375, "learning_rate": 9.812118433174727e-07, "loss": -0.0, "reward": 1.9375, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1272, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 470.71875, "epoch": 0.08760580827197027, "grad_norm": 0.9182423871942043, "kl": 0.6953125, "learning_rate": 9.811824773576748e-07, "loss": 0.0, "reward": 2.2860541343688965, "reward_std": 0.06185869872570038, "rewards/accuracy_reward": 0.6391793489456177, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1273, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 469.34375, "epoch": 0.08767462666024362, "grad_norm": 0.0, "kl": 0.6171875, "learning_rate": 9.811530889063707e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1274, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.5625, "epoch": 0.08774344504851696, "grad_norm": 0.4946561040455413, "kl": 0.625, "learning_rate": 9.811236779649336e-07, "loss": -0.0, "reward": 2.402083396911621, "reward_std": 0.005892553832381964, "rewards/accuracy_reward": 0.7520833015441895, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1275, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 444.53125, "epoch": 0.08781226343679031, "grad_norm": 0.9718583285965576, "kl": 0.62109375, "learning_rate": 9.81094244534739e-07, "loss": -0.0, "reward": 2.4084010124206543, "reward_std": 0.13161706924438477, "rewards/accuracy_reward": 0.7552760243415833, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 1276, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 423.53125, "epoch": 0.08788108182506366, "grad_norm": 0.5432960598144128, "kl": 0.65625, "learning_rate": 9.810647886171622e-07, "loss": -0.0, "reward": 2.6589016914367676, "reward_std": 0.008294622413814068, "rewards/accuracy_reward": 0.9589017629623413, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1277, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 491.59375, "epoch": 0.087949900213337, "grad_norm": 0.6991431579220323, "kl": 0.6484375, "learning_rate": 9.8103531021358e-07, "loss": -0.0, "reward": 2.4997751712799072, "reward_std": 0.06817067414522171, "rewards/accuracy_reward": 0.8341501355171204, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.484375, "step": 1278, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 422.484375, "epoch": 0.08801871860161035, "grad_norm": 0.5437381771676798, "kl": 0.65625, "learning_rate": 9.810058093253706e-07, "loss": 0.0, "reward": 2.046875, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1279, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 446.3125, "epoch": 0.0880875369898837, "grad_norm": 0.8437860840926663, "kl": 0.65625, "learning_rate": 9.809762859539125e-07, "loss": 0.0, "reward": 2.435328483581543, "reward_std": 0.055945418775081635, "rewards/accuracy_reward": 0.7634533643722534, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1280, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 457.765625, "epoch": 0.08815635537815704, "grad_norm": 2.3754604266459007, "kl": 0.6953125, "learning_rate": 9.809467401005862e-07, "loss": 0.0, "reward": 2.2125000953674316, "reward_std": 0.06943651288747787, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1281, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 500.328125, "epoch": 0.0882251737664304, "grad_norm": 0.7630213232312361, "kl": 0.6875, "learning_rate": 9.809171717667723e-07, "loss": -0.0, "reward": 1.8591547012329102, "reward_std": 0.07365895062685013, "rewards/accuracy_reward": 0.7029046416282654, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1282, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 434.53125, "epoch": 0.08829399215470374, "grad_norm": 0.0, "kl": 0.65234375, "learning_rate": 9.808875809538532e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1283, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 458.0625, "epoch": 0.08836281054297708, "grad_norm": 0.41165863514786666, "kl": 0.6328125, "learning_rate": 9.808579676632119e-07, "loss": 0.0, "reward": 1.949097990989685, "reward_std": 0.07044535875320435, "rewards/accuracy_reward": 0.7928479313850403, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1284, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 503.390625, "epoch": 0.08843162893125044, "grad_norm": 0.4900563412283271, "kl": 0.640625, "learning_rate": 9.808283318962322e-07, "loss": 0.0, "reward": 2.510435104370117, "reward_std": 0.016525836661458015, "rewards/accuracy_reward": 0.8354349136352539, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1285, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 451.375, "epoch": 0.08850044731952378, "grad_norm": 0.6961970145673538, "kl": 0.62109375, "learning_rate": 9.807986736543e-07, "loss": 0.0, "reward": 2.1208438873291016, "reward_std": 0.01566043123602867, "rewards/accuracy_reward": 0.920843780040741, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1286, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 528.65625, "epoch": 0.08856926570779712, "grad_norm": 1.2318354289035467, "kl": 0.6484375, "learning_rate": 9.807689929388015e-07, "loss": -0.0, "reward": 1.889380693435669, "reward_std": 0.0766715481877327, "rewards/accuracy_reward": 0.717505693435669, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1287, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 475.828125, "epoch": 0.08863808409607048, "grad_norm": 0.5818189138152061, "kl": 0.625, "learning_rate": 9.807392897511237e-07, "loss": 0.0, "reward": 2.53125, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1288, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 517.078125, "epoch": 0.08870690248434382, "grad_norm": 0.8305040375333329, "kl": 0.66015625, "learning_rate": 9.80709564092655e-07, "loss": -0.0, "reward": 1.8241829872131348, "reward_std": 0.043147776275873184, "rewards/accuracy_reward": 0.32574549317359924, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3984375, "step": 1289, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 505.21875, "epoch": 0.08877572087261716, "grad_norm": 0.9192934707875692, "kl": 0.59765625, "learning_rate": 9.806798159647853e-07, "loss": 0.0, "reward": 1.990694284439087, "reward_std": 0.047222770750522614, "rewards/accuracy_reward": 0.8156942129135132, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1290, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 527.640625, "epoch": 0.0888445392608905, "grad_norm": 0.4486040391181284, "kl": 0.6953125, "learning_rate": 9.806500453689046e-07, "loss": 0.0, "reward": 1.696461796760559, "reward_std": 0.011491692624986172, "rewards/accuracy_reward": 0.5714617967605591, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1291, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 520.046875, "epoch": 0.08891335764916386, "grad_norm": 0.4076870165662958, "kl": 0.6015625, "learning_rate": 9.806202523064044e-07, "loss": -0.0, "reward": 2.472808837890625, "reward_std": 0.0017793537117540836, "rewards/accuracy_reward": 0.7978086471557617, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1292, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 523.03125, "epoch": 0.0889821760374372, "grad_norm": 0.9768210924678171, "kl": 0.69921875, "learning_rate": 9.805904367786778e-07, "loss": 0.0, "reward": 1.6634318828582764, "reward_std": 0.09680310636758804, "rewards/accuracy_reward": 0.547806978225708, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1293, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 549.8125, "epoch": 0.08905099442571054, "grad_norm": 0.4762179733905738, "kl": 0.66796875, "learning_rate": 9.805605987871183e-07, "loss": 0.0, "reward": 2.488772392272949, "reward_std": 0.002441114280372858, "rewards/accuracy_reward": 0.9137722253799438, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1294, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 543.5625, "epoch": 0.0891198128139839, "grad_norm": 0.9188652674679031, "kl": 0.6640625, "learning_rate": 9.805307383331201e-07, "loss": 0.0, "reward": 2.2125000953674316, "reward_std": 0.06943651288747787, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1295, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 508.640625, "epoch": 0.08918863120225724, "grad_norm": 0.842188070372446, "kl": 0.59375, "learning_rate": 9.805008554180794e-07, "loss": -0.0, "reward": 1.6281417608261108, "reward_std": 0.07403510063886642, "rewards/accuracy_reward": 0.49689173698425293, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1296, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 496.234375, "epoch": 0.08925744959053059, "grad_norm": 0.811757098895209, "kl": 0.61328125, "learning_rate": 9.80470950043393e-07, "loss": 0.0, "reward": 2.045710563659668, "reward_std": 0.08981916308403015, "rewards/accuracy_reward": 0.8644604682922363, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1297, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 498.421875, "epoch": 0.08932626797880394, "grad_norm": 0.6538126619826302, "kl": 0.703125, "learning_rate": 9.804410222104584e-07, "loss": 0.0, "reward": 2.5421876907348633, "reward_std": 0.022097086533904076, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 1298, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 501.421875, "epoch": 0.08939508636707728, "grad_norm": 0.47327152391129845, "kl": 0.640625, "learning_rate": 9.804110719206748e-07, "loss": 0.0, "reward": 1.8173134326934814, "reward_std": 0.00230008689686656, "rewards/accuracy_reward": 0.6673133373260498, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1299, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 502.46875, "epoch": 0.08946390475535063, "grad_norm": 0.0, "kl": 0.6328125, "learning_rate": 9.80381099175442e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1300, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 550.75, "epoch": 0.08953272314362398, "grad_norm": 1.010437415696544, "kl": 0.6171875, "learning_rate": 9.803511039761612e-07, "loss": -0.0, "reward": 2.5472707748413086, "reward_std": 0.008736895397305489, "rewards/accuracy_reward": 0.8472709059715271, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1301, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 468.0, "epoch": 0.08960154153189732, "grad_norm": 0.0, "kl": 0.671875, "learning_rate": 9.803210863242343e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1302, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 489.796875, "epoch": 0.08967035992017067, "grad_norm": 0.7051604951297898, "kl": 0.625, "learning_rate": 9.802910462210643e-07, "loss": 0.0, "reward": 1.974198341369629, "reward_std": 0.0018715625628829002, "rewards/accuracy_reward": 0.7991982698440552, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1303, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 478.0625, "epoch": 0.08973917830844402, "grad_norm": 7.898697767191134, "kl": 0.65234375, "learning_rate": 9.802609836680553e-07, "loss": 0.0, "reward": 2.046875, "reward_std": 0.09722718596458435, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1304, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 450.921875, "epoch": 0.08980799669671737, "grad_norm": 0.6963993074979794, "kl": 0.65625, "learning_rate": 9.802308986666128e-07, "loss": -0.0, "reward": 1.834372878074646, "reward_std": 0.047308407723903656, "rewards/accuracy_reward": 0.6843729019165039, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1305, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 451.75, "epoch": 0.08987681508499071, "grad_norm": 0.0, "kl": 0.6640625, "learning_rate": 9.802007912181427e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1306, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 457.71875, "epoch": 0.08994563347326405, "grad_norm": 0.6582556902188492, "kl": 0.7421875, "learning_rate": 9.801706613240525e-07, "loss": 0.0, "reward": 2.380654811859131, "reward_std": 0.033407654613256454, "rewards/accuracy_reward": 0.761904776096344, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.46875, "step": 1307, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 445.65625, "epoch": 0.0900144518615374, "grad_norm": 0.0, "kl": 0.77734375, "learning_rate": 9.801405089857504e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1308, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 450.140625, "epoch": 0.09008327024981075, "grad_norm": 0.0, "kl": 0.65234375, "learning_rate": 9.801103342046459e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1309, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 457.90625, "epoch": 0.09015208863808409, "grad_norm": 0.6079264188319006, "kl": 0.66796875, "learning_rate": 9.800801369821494e-07, "loss": -0.0, "reward": 2.6437501907348633, "reward_std": 0.07763238251209259, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1310, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 450.59375, "epoch": 0.09022090702635745, "grad_norm": 0.6809083297559457, "kl": 0.71875, "learning_rate": 9.800499173196724e-07, "loss": 0.0, "reward": 2.2923269271850586, "reward_std": 0.024716829881072044, "rewards/accuracy_reward": 0.7595144510269165, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3828125, "step": 1311, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 473.609375, "epoch": 0.09028972541463079, "grad_norm": 0.8508444265649753, "kl": 0.6171875, "learning_rate": 9.800196752186273e-07, "loss": 0.0, "reward": 2.5769879817962646, "reward_std": 0.08253377676010132, "rewards/accuracy_reward": 0.8863629102706909, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1312, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 434.5625, "epoch": 0.09035854380290413, "grad_norm": 12.239030209682387, "kl": 0.640625, "learning_rate": 9.799894106804278e-07, "loss": -0.0, "reward": 2.6437501907348633, "reward_std": 0.07763238251209259, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1313, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 447.546875, "epoch": 0.09042736219117749, "grad_norm": 0.997513560343422, "kl": 0.75390625, "learning_rate": 9.799591237064886e-07, "loss": -0.0, "reward": 1.582167625427246, "reward_std": 0.08114176243543625, "rewards/accuracy_reward": 0.47279250621795654, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1314, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 457.453125, "epoch": 0.09049618057945083, "grad_norm": 0.45256356439201395, "kl": 0.67578125, "learning_rate": 9.799288142982251e-07, "loss": -0.0, "reward": 2.14113712310791, "reward_std": 0.023799192160367966, "rewards/accuracy_reward": 0.9411370754241943, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1315, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 409.59375, "epoch": 0.09056499896772417, "grad_norm": 0.0, "kl": 0.6875, "learning_rate": 9.798984824570545e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1316, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 425.03125, "epoch": 0.09063381735599753, "grad_norm": 0.0, "kl": 0.6875, "learning_rate": 9.79868128184394e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1317, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 453.65625, "epoch": 0.09070263574427087, "grad_norm": 0.845674428867641, "kl": 0.71875, "learning_rate": 9.79837751481663e-07, "loss": -0.0, "reward": 2.6272096633911133, "reward_std": 0.025469521060585976, "rewards/accuracy_reward": 0.9350221157073975, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 1318, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 442.53125, "epoch": 0.09077145413254421, "grad_norm": 1.1870998388082865, "kl": 0.67578125, "learning_rate": 9.798073523502809e-07, "loss": 0.0, "reward": 2.1556596755981445, "reward_std": 0.017199959605932236, "rewards/accuracy_reward": 0.958784818649292, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1319, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 441.71875, "epoch": 0.09084027252081757, "grad_norm": 0.40262069578366455, "kl": 0.67578125, "learning_rate": 9.797769307916688e-07, "loss": -0.0, "reward": 2.1399388313293457, "reward_std": 0.026886316016316414, "rewards/accuracy_reward": 0.9399389624595642, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1320, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 448.546875, "epoch": 0.09090909090909091, "grad_norm": 0.632763627602718, "kl": 0.671875, "learning_rate": 9.797464868072486e-07, "loss": 0.0, "reward": 2.0166666507720947, "reward_std": 0.020573778077960014, "rewards/accuracy_reward": 0.8416666984558105, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1321, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 425.265625, "epoch": 0.09097790929736425, "grad_norm": 0.45894414580960735, "kl": 0.64453125, "learning_rate": 9.797160203984437e-07, "loss": 0.0, "reward": 2.1099867820739746, "reward_std": 0.004118278156965971, "rewards/accuracy_reward": 0.9099867939949036, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1322, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 430.546875, "epoch": 0.0910467276856376, "grad_norm": 2.864248315156742, "kl": 0.65625, "learning_rate": 9.796855315666776e-07, "loss": -0.0, "reward": 2.633049488067627, "reward_std": 0.005658179055899382, "rewards/accuracy_reward": 0.9330494999885559, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1323, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 407.671875, "epoch": 0.09111554607391095, "grad_norm": 1.1904613866518141, "kl": 0.70703125, "learning_rate": 9.796550203133757e-07, "loss": 0.0, "reward": 2.25991153717041, "reward_std": 0.08324345201253891, "rewards/accuracy_reward": 0.6317866444587708, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1324, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 446.515625, "epoch": 0.0911843644621843, "grad_norm": 0.0, "kl": 0.64453125, "learning_rate": 9.79624486639964e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1325, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 425.25, "epoch": 0.09125318285045764, "grad_norm": 1.212958346282613, "kl": 0.640625, "learning_rate": 9.795939305478701e-07, "loss": 0.0, "reward": 1.9970290660858154, "reward_std": 0.09154713153839111, "rewards/accuracy_reward": 0.8189040422439575, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1326, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 450.0, "epoch": 0.091322001238731, "grad_norm": 1.0047888222226509, "kl": 0.66796875, "learning_rate": 9.79563352038522e-07, "loss": 0.0, "reward": 1.9187500476837158, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1327, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 468.6875, "epoch": 0.09139081962700434, "grad_norm": 0.9007986614258526, "kl": 0.6875, "learning_rate": 9.79532751113349e-07, "loss": -0.0, "reward": 1.7687500715255737, "reward_std": 0.1578107476234436, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1328, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 454.296875, "epoch": 0.09145963801527768, "grad_norm": 0.5189491685238862, "kl": 0.72265625, "learning_rate": 9.795021277737814e-07, "loss": 0.0, "reward": 2.650242328643799, "reward_std": 0.008553924970328808, "rewards/accuracy_reward": 0.9502423405647278, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1329, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 491.046875, "epoch": 0.09152845640355103, "grad_norm": 0.8256542485125435, "kl": 0.6875, "learning_rate": 9.794714820212506e-07, "loss": 0.0, "reward": 2.0578126907348633, "reward_std": 0.022097086533904076, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0078125, "step": 1330, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 464.390625, "epoch": 0.09159727479182438, "grad_norm": 0.7589375075603116, "kl": 0.6484375, "learning_rate": 9.794408138571893e-07, "loss": 0.0, "reward": 2.4675869941711426, "reward_std": 0.0035863430239260197, "rewards/accuracy_reward": 0.7925870418548584, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1331, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 455.953125, "epoch": 0.09166609318009772, "grad_norm": 0.0, "kl": 0.6484375, "learning_rate": 9.794101232830306e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1332, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 502.03125, "epoch": 0.09173491156837107, "grad_norm": 0.0, "kl": 0.62890625, "learning_rate": 9.793794103002094e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1333, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 455.453125, "epoch": 0.09180372995664442, "grad_norm": 0.4995840776221721, "kl": 0.6484375, "learning_rate": 9.79348674910161e-07, "loss": -0.0, "reward": 2.4461755752563477, "reward_std": 0.004264346789568663, "rewards/accuracy_reward": 0.7711755633354187, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1334, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 464.375, "epoch": 0.09187254834491776, "grad_norm": 0.0, "kl": 0.65234375, "learning_rate": 9.793179171143223e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1335, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 465.40625, "epoch": 0.09194136673319112, "grad_norm": 0.5985855150999064, "kl": 0.66796875, "learning_rate": 9.792871369141309e-07, "loss": -0.0, "reward": 2.5656251907348633, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1336, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 481.625, "epoch": 0.09201018512146446, "grad_norm": 0.37966815742142535, "kl": 0.64453125, "learning_rate": 9.792563343110253e-07, "loss": -0.0, "reward": 1.9635138511657715, "reward_std": 0.0011787739349529147, "rewards/accuracy_reward": 0.7885138392448425, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1337, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 514.828125, "epoch": 0.0920790035097378, "grad_norm": 9.831227871489707, "kl": 0.65625, "learning_rate": 9.792255093064457e-07, "loss": 0.0, "reward": 2.002027988433838, "reward_std": 0.14670932292938232, "rewards/accuracy_reward": 0.8364028930664062, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1338, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 485.78125, "epoch": 0.09214782189801114, "grad_norm": 0.0, "kl": 0.63671875, "learning_rate": 9.791946619018327e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1339, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 503.296875, "epoch": 0.0922166402862845, "grad_norm": 0.0, "kl": 0.6484375, "learning_rate": 9.79163792098628e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1340, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 488.296875, "epoch": 0.09228545867455784, "grad_norm": 0.0, "kl": 0.6484375, "learning_rate": 9.79132899898275e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1341, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 515.640625, "epoch": 0.09235427706283118, "grad_norm": 0.0, "kl": 0.72265625, "learning_rate": 9.79101985302217e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1342, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 508.109375, "epoch": 0.09242309545110454, "grad_norm": 0.0, "kl": 0.70703125, "learning_rate": 9.790710483118996e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1343, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 522.5, "epoch": 0.09249191383937788, "grad_norm": 1.133821317158792, "kl": 0.6875, "learning_rate": 9.790400889287685e-07, "loss": 0.0, "reward": 2.3546876907348633, "reward_std": 0.06629125773906708, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3046875, "step": 1344, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 539.15625, "epoch": 0.09256073222765122, "grad_norm": 0.5278093447207269, "kl": 0.609375, "learning_rate": 9.790091071542711e-07, "loss": -0.0, "reward": 1.8704696893692017, "reward_std": 0.18549489974975586, "rewards/accuracy_reward": 0.7454697489738464, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 1345, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 537.15625, "epoch": 0.09262955061592458, "grad_norm": 0.8602005709664617, "kl": 0.62890625, "learning_rate": 9.789781029898555e-07, "loss": -0.0, "reward": 1.603124976158142, "reward_std": 0.16666369140148163, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1346, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 502.015625, "epoch": 0.09269836900419792, "grad_norm": 0.8882718034129515, "kl": 0.68359375, "learning_rate": 9.789470764369707e-07, "loss": 0.0, "reward": 2.4025425910949707, "reward_std": 0.15829665958881378, "rewards/accuracy_reward": 0.8009800314903259, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4453125, "step": 1347, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 536.0, "epoch": 0.09276718739247126, "grad_norm": 0.5063706720746719, "kl": 0.68359375, "learning_rate": 9.78916027497067e-07, "loss": -0.0, "reward": 2.1437501907348633, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1348, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 552.046875, "epoch": 0.09283600578074462, "grad_norm": 0.8135691296169767, "kl": 0.6015625, "learning_rate": 9.788849561715957e-07, "loss": -0.0, "reward": 2.4611942768096924, "reward_std": 0.25232481956481934, "rewards/accuracy_reward": 0.8143192529678345, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.484375, "step": 1349, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 504.875, "epoch": 0.09290482416901796, "grad_norm": 0.6446570357082827, "kl": 0.671875, "learning_rate": 9.788538624620092e-07, "loss": 0.0, "reward": 2.1617696285247803, "reward_std": 0.0536167174577713, "rewards/accuracy_reward": 0.6680195331573486, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.34375, "step": 1350, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 498.140625, "epoch": 0.0929736425572913, "grad_norm": 0.0, "kl": 0.63671875, "learning_rate": 9.788227463697612e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1351, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 449.96875, "epoch": 0.09304246094556466, "grad_norm": 0.8934289988461832, "kl": 0.79296875, "learning_rate": 9.787916078963055e-07, "loss": -0.0, "reward": 2.5031251907348633, "reward_std": 0.0289318785071373, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.453125, "step": 1352, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 500.734375, "epoch": 0.093111279333838, "grad_norm": 0.0, "kl": 0.625, "learning_rate": 9.78760447043098e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1353, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 471.5625, "epoch": 0.09318009772211135, "grad_norm": 0.6794469206223788, "kl": 0.640625, "learning_rate": 9.78729263811595e-07, "loss": -0.0, "reward": 2.408158779144287, "reward_std": 0.08445257693529129, "rewards/accuracy_reward": 0.7487836480140686, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1354, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 509.34375, "epoch": 0.09324891611038469, "grad_norm": 0.5658595439276989, "kl": 0.67578125, "learning_rate": 9.786980582032544e-07, "loss": 0.0, "reward": 2.4937500953674316, "reward_std": 0.12483581155538559, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.328125, "step": 1355, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 443.3125, "epoch": 0.09331773449865804, "grad_norm": 1.4491950054057536, "kl": 0.765625, "learning_rate": 9.786668302195344e-07, "loss": 0.0, "reward": 2.4634640216827393, "reward_std": 0.010965334251523018, "rewards/accuracy_reward": 0.8259639739990234, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1356, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 507.265625, "epoch": 0.09338655288693139, "grad_norm": 0.5527542295396702, "kl": 0.59375, "learning_rate": 9.78635579861895e-07, "loss": 0.0, "reward": 2.4557113647460938, "reward_std": 0.004248400684446096, "rewards/accuracy_reward": 0.7807111740112305, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1357, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 494.171875, "epoch": 0.09345537127520473, "grad_norm": 0.49676671301442615, "kl": 0.62890625, "learning_rate": 9.786043071317967e-07, "loss": -0.0, "reward": 2.0875000953674316, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1358, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 507.0, "epoch": 0.09352418966347809, "grad_norm": 0.0, "kl": 0.5859375, "learning_rate": 9.785730120307013e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1359, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 491.21875, "epoch": 0.09359300805175143, "grad_norm": 0.5526953766295908, "kl": 0.609375, "learning_rate": 9.78541694560072e-07, "loss": 0.0, "reward": 1.993749976158142, "reward_std": 0.12246951460838318, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1360, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 439.78125, "epoch": 0.09366182644002477, "grad_norm": 0.49573587776190164, "kl": 0.7734375, "learning_rate": 9.78510354721372e-07, "loss": 0.0, "reward": 2.468684673309326, "reward_std": 0.0028001158498227596, "rewards/accuracy_reward": 0.7936844825744629, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1361, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 449.078125, "epoch": 0.09373064482829813, "grad_norm": 1.0275222617844173, "kl": 0.74609375, "learning_rate": 9.784789925160663e-07, "loss": -0.0, "reward": 2.386446952819824, "reward_std": 0.011405516415834427, "rewards/accuracy_reward": 0.7739467620849609, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1362, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 480.734375, "epoch": 0.09379946321657147, "grad_norm": 0.0, "kl": 0.59765625, "learning_rate": 9.784476079456213e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1363, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 458.03125, "epoch": 0.09386828160484481, "grad_norm": 0.0, "kl": 0.6875, "learning_rate": 9.784162010115036e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1364, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 490.1875, "epoch": 0.09393709999311817, "grad_norm": 0.0, "kl": 0.71484375, "learning_rate": 9.783847717151812e-07, "loss": 0.0, "reward": 2.4875001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1365, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 448.0, "epoch": 0.09400591838139151, "grad_norm": 0.5452874658013281, "kl": 0.66796875, "learning_rate": 9.783533200581235e-07, "loss": 0.0, "reward": 1.9891386032104492, "reward_std": 0.005113601218909025, "rewards/accuracy_reward": 0.8141385316848755, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1366, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 490.484375, "epoch": 0.09407473676966485, "grad_norm": 0.8711450017651692, "kl": 0.703125, "learning_rate": 9.783218460418003e-07, "loss": -0.0, "reward": 2.2093749046325684, "reward_std": 0.12175466865301132, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.421875, "step": 1367, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 489.828125, "epoch": 0.09414355515793821, "grad_norm": 0.4946714253684353, "kl": 0.6015625, "learning_rate": 9.782903496676829e-07, "loss": 0.0, "reward": 2.0037119388580322, "reward_std": 0.004572661593556404, "rewards/accuracy_reward": 0.8287118077278137, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1368, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 469.703125, "epoch": 0.09421237354621155, "grad_norm": 1.004123605895756, "kl": 0.66796875, "learning_rate": 9.782588309372434e-07, "loss": 0.0, "reward": 2.47747540473938, "reward_std": 0.06373319029808044, "rewards/accuracy_reward": 0.8056004047393799, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1369, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 480.8125, "epoch": 0.09428119193448489, "grad_norm": 0.6828153070722099, "kl": 0.6328125, "learning_rate": 9.782272898519553e-07, "loss": -0.0, "reward": 1.9019745588302612, "reward_std": 0.0894605740904808, "rewards/accuracy_reward": 0.7425994873046875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1370, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 476.734375, "epoch": 0.09435001032275823, "grad_norm": 0.8290177752236805, "kl": 0.6328125, "learning_rate": 9.781957264132925e-07, "loss": 0.0, "reward": 2.5434656143188477, "reward_std": 0.08320365846157074, "rewards/accuracy_reward": 0.8559654951095581, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1371, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 444.421875, "epoch": 0.09441882871103159, "grad_norm": 0.8382921244976885, "kl": 0.76171875, "learning_rate": 9.781641406227307e-07, "loss": 0.0, "reward": 2.6265625953674316, "reward_std": 0.030935924500226974, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4296875, "step": 1372, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 436.0625, "epoch": 0.09448764709930493, "grad_norm": 0.6378599541387479, "kl": 0.6953125, "learning_rate": 9.781325324817462e-07, "loss": -0.0, "reward": 2.628872871398926, "reward_std": 0.001284443773329258, "rewards/accuracy_reward": 0.92887282371521, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1373, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 436.96875, "epoch": 0.09455646548757828, "grad_norm": 0.5293159402583625, "kl": 0.7421875, "learning_rate": 9.781009019918165e-07, "loss": -0.0, "reward": 1.9375, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1374, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 488.625, "epoch": 0.09462528387585163, "grad_norm": 0.9027343363522639, "kl": 0.609375, "learning_rate": 9.7806924915442e-07, "loss": 0.0, "reward": 1.790411353111267, "reward_std": 0.08076009154319763, "rewards/accuracy_reward": 0.6341614127159119, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1375, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 438.265625, "epoch": 0.09469410226412497, "grad_norm": 1.4513711111502958, "kl": 0.73828125, "learning_rate": 9.78037573971036e-07, "loss": 0.0, "reward": 2.203214168548584, "reward_std": 0.08246174454689026, "rewards/accuracy_reward": 0.5594642162322998, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1376, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 468.8125, "epoch": 0.09476292065239832, "grad_norm": 0.5556668567996516, "kl": 0.67578125, "learning_rate": 9.780058764431453e-07, "loss": -0.0, "reward": 2.474498987197876, "reward_std": 0.003033955581486225, "rewards/accuracy_reward": 0.799498975276947, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1377, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 487.390625, "epoch": 0.09483173904067167, "grad_norm": 0.0, "kl": 0.6328125, "learning_rate": 9.779741565722295e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1378, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 471.34375, "epoch": 0.09490055742894501, "grad_norm": 0.9080135561217839, "kl": 0.65234375, "learning_rate": 9.779424143597712e-07, "loss": -0.0, "reward": 2.370049476623535, "reward_std": 0.08096026629209518, "rewards/accuracy_reward": 0.7106744050979614, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1379, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 410.921875, "epoch": 0.09496937581721836, "grad_norm": 1.6794635715465425, "kl": 0.78515625, "learning_rate": 9.779106498072541e-07, "loss": 0.0, "reward": 2.581728935241699, "reward_std": 0.0925736278295517, "rewards/accuracy_reward": 0.8942289352416992, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1380, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 499.125, "epoch": 0.09503819420549171, "grad_norm": 2.107717026392813, "kl": 0.6328125, "learning_rate": 9.77878862916163e-07, "loss": 0.0, "reward": 1.8250000476837158, "reward_std": 0.08017838001251221, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1381, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 470.703125, "epoch": 0.09510701259376506, "grad_norm": 0.9965911737935469, "kl": 0.65625, "learning_rate": 9.77847053687984e-07, "loss": 0.0, "reward": 2.542982816696167, "reward_std": 0.020343486219644547, "rewards/accuracy_reward": 0.8461078405380249, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1382, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 487.28125, "epoch": 0.0951758309820384, "grad_norm": 0.5360080341394148, "kl": 0.703125, "learning_rate": 9.778152221242031e-07, "loss": -0.0, "reward": 1.9937500953674316, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1383, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 480.90625, "epoch": 0.09524464937031175, "grad_norm": 0.43181090842706854, "kl": 0.640625, "learning_rate": 9.77783368226309e-07, "loss": 0.0, "reward": 1.8295502662658691, "reward_std": 0.007648163475096226, "rewards/accuracy_reward": 0.6795501708984375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1384, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 491.796875, "epoch": 0.0953134677585851, "grad_norm": 0.0, "kl": 0.609375, "learning_rate": 9.7775149199579e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1385, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 487.953125, "epoch": 0.09538228614685844, "grad_norm": 1.4319462085016108, "kl": 0.62890625, "learning_rate": 9.777195934341366e-07, "loss": 0.0, "reward": 2.0622692108154297, "reward_std": 0.008104191161692142, "rewards/accuracy_reward": 0.8622691631317139, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1386, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 494.15625, "epoch": 0.09545110453513178, "grad_norm": 0.0, "kl": 0.625, "learning_rate": 9.776876725428396e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1387, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 470.40625, "epoch": 0.09551992292340514, "grad_norm": 0.0, "kl": 0.6953125, "learning_rate": 9.776557293233908e-07, "loss": 0.0, "reward": 2.2916667461395264, "reward_std": 0.0, "rewards/accuracy_reward": 0.6416666507720947, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1388, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 475.421875, "epoch": 0.09558874131167848, "grad_norm": 1.420132779464532, "kl": 0.69140625, "learning_rate": 9.776237637772837e-07, "loss": -0.0, "reward": 2.03125, "reward_std": 0.22724726796150208, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1389, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 507.75, "epoch": 0.09565755969995182, "grad_norm": 0.804602545920819, "kl": 0.6484375, "learning_rate": 9.775917759060122e-07, "loss": -0.0, "reward": 2.41988205909729, "reward_std": 0.03613942489027977, "rewards/accuracy_reward": 0.7839444875717163, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4609375, "step": 1390, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 468.0, "epoch": 0.09572637808822518, "grad_norm": 3.085464325022229, "kl": 0.6484375, "learning_rate": 9.775597657110713e-07, "loss": -0.0, "reward": 1.9517686367034912, "reward_std": 0.020461687818169594, "rewards/accuracy_reward": 0.7798935174942017, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1391, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 485.546875, "epoch": 0.09579519647649852, "grad_norm": 0.5519907626344046, "kl": 0.63671875, "learning_rate": 9.775277331939577e-07, "loss": 0.0, "reward": 2.127082586288452, "reward_std": 0.004301133565604687, "rewards/accuracy_reward": 0.9270825386047363, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1392, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 488.796875, "epoch": 0.09586401486477186, "grad_norm": 0.43387235063601437, "kl": 0.62109375, "learning_rate": 9.774956783561684e-07, "loss": -0.0, "reward": 1.973430871963501, "reward_std": 0.0033643224742263556, "rewards/accuracy_reward": 0.7984308004379272, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1393, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 452.203125, "epoch": 0.09593283325304522, "grad_norm": 0.8333493889007814, "kl": 0.7109375, "learning_rate": 9.774636011992018e-07, "loss": 0.0, "reward": 2.3970251083374023, "reward_std": 0.16921177506446838, "rewards/accuracy_reward": 0.7907751202583313, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1394, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 445.203125, "epoch": 0.09600165164131856, "grad_norm": 0.0, "kl": 0.703125, "learning_rate": 9.774315017245568e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1395, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.0, "completion_length": 478.703125, "epoch": 0.0960704700295919, "grad_norm": 1.7447915376434642, "kl": 0.59765625, "learning_rate": 9.773993799337345e-07, "loss": -0.0, "reward": 1.974153995513916, "reward_std": 0.4846225380897522, "rewards/accuracy_reward": 0.47415393590927124, "rewards/format_reward": 0.90625, "rewards/transform_reward": 0.453125, "step": 1396, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 460.265625, "epoch": 0.09613928841786526, "grad_norm": 1.0088849542283733, "kl": 0.640625, "learning_rate": 9.77367235828236e-07, "loss": 0.0, "reward": 1.84375, "reward_std": 0.12246951460838318, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1397, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 451.453125, "epoch": 0.0962081068061386, "grad_norm": 2.049832573735569, "kl": 0.6328125, "learning_rate": 9.77335069409564e-07, "loss": 0.0, "reward": 2.040987968444824, "reward_std": 0.035654179751873016, "rewards/accuracy_reward": 0.8409878611564636, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1398, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 458.984375, "epoch": 0.09627692519441194, "grad_norm": 0.8858403936706548, "kl": 0.6640625, "learning_rate": 9.773028806792215e-07, "loss": 0.0, "reward": 2.28125, "reward_std": 0.1578107476234436, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.25, "step": 1399, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 455.421875, "epoch": 0.0963457435826853, "grad_norm": 0.0, "kl": 0.625, "learning_rate": 9.772706696387136e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1400, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 424.390625, "epoch": 0.09641456197095864, "grad_norm": 0.5289714832886332, "kl": 0.625, "learning_rate": 9.772384362895455e-07, "loss": 0.0, "reward": 2.046875, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1401, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 444.4375, "epoch": 0.09648338035923198, "grad_norm": 1.0550660123939686, "kl": 0.62890625, "learning_rate": 9.772061806332244e-07, "loss": -0.0, "reward": 2.0071587562561035, "reward_std": 0.07502136379480362, "rewards/accuracy_reward": 0.8102836608886719, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1402, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 464.0, "epoch": 0.09655219874750533, "grad_norm": 2.0382559972169725, "kl": 0.578125, "learning_rate": 9.771739026712574e-07, "loss": 0.0, "reward": 2.360335350036621, "reward_std": 0.07089250534772873, "rewards/accuracy_reward": 0.7040855288505554, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1403, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 441.21875, "epoch": 0.09662101713577868, "grad_norm": 0.7682408943470865, "kl": 0.57421875, "learning_rate": 9.77141602405154e-07, "loss": 0.0, "reward": 2.0180788040161133, "reward_std": 0.017338111996650696, "rewards/accuracy_reward": 0.8430788516998291, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1404, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 460.046875, "epoch": 0.09668983552405203, "grad_norm": 0.0, "kl": 0.640625, "learning_rate": 9.77109279836423e-07, "loss": 0.0, "reward": 2.445833206176758, "reward_std": 0.0, "rewards/accuracy_reward": 0.7708333730697632, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1405, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 443.0625, "epoch": 0.09675865391232537, "grad_norm": 0.0, "kl": 0.58984375, "learning_rate": 9.770769349665759e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1406, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 420.78125, "epoch": 0.09682747230059872, "grad_norm": 0.0, "kl": 0.609375, "learning_rate": 9.770445677971246e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1407, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 433.578125, "epoch": 0.09689629068887207, "grad_norm": 1.1422006939018174, "kl": 0.60546875, "learning_rate": 9.770121783295816e-07, "loss": 0.0, "reward": 1.9263403415679932, "reward_std": 0.09065379202365875, "rewards/accuracy_reward": 0.7638402581214905, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1408, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 421.15625, "epoch": 0.09696510907714541, "grad_norm": 0.8402766467132549, "kl": 0.65625, "learning_rate": 9.769797665654612e-07, "loss": 0.0, "reward": 1.896875023841858, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1409, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 437.234375, "epoch": 0.09703392746541877, "grad_norm": 1.5519226092143599, "kl": 0.55859375, "learning_rate": 9.76947332506278e-07, "loss": -0.0, "reward": 2.585268020629883, "reward_std": 0.0740416869521141, "rewards/accuracy_reward": 0.8915179967880249, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1410, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 461.421875, "epoch": 0.09710274585369211, "grad_norm": 0.4917368126557366, "kl": 0.65625, "learning_rate": 9.769148761535485e-07, "loss": -0.0, "reward": 1.8607773780822754, "reward_std": 0.0030322198290377855, "rewards/accuracy_reward": 0.6857773065567017, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1411, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 427.890625, "epoch": 0.09717156424196545, "grad_norm": 2.2821186805811333, "kl": 0.65234375, "learning_rate": 9.768823975087897e-07, "loss": -0.0, "reward": 2.449496269226074, "reward_std": 0.056574124842882156, "rewards/accuracy_reward": 0.7776214480400085, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1412, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 432.875, "epoch": 0.0972403826302388, "grad_norm": 0.7124473196366423, "kl": 0.6484375, "learning_rate": 9.768498965735193e-07, "loss": -0.0, "reward": 2.4750001430511475, "reward_std": 0.08017837256193161, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1413, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 422.734375, "epoch": 0.09730920101851215, "grad_norm": 0.0, "kl": 0.671875, "learning_rate": 9.76817373349257e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1414, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 436.25, "epoch": 0.09737801940678549, "grad_norm": 1.3552036996929724, "kl": 0.65234375, "learning_rate": 9.767848278375227e-07, "loss": 0.0, "reward": 2.1972804069519043, "reward_std": 0.17782065272331238, "rewards/accuracy_reward": 0.623842716217041, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4296875, "step": 1415, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 411.8125, "epoch": 0.09744683779505885, "grad_norm": 0.0, "kl": 0.67578125, "learning_rate": 9.767522600398376e-07, "loss": 0.0, "reward": 2.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1416, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 446.4375, "epoch": 0.09751565618333219, "grad_norm": 1.7522539737430738, "kl": 0.6484375, "learning_rate": 9.767196699577243e-07, "loss": 0.0, "reward": 2.5252845287323, "reward_std": 0.12477416545152664, "rewards/accuracy_reward": 0.864346981048584, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4765625, "step": 1417, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 420.0625, "epoch": 0.09758447457160553, "grad_norm": 1.402774724342387, "kl": 0.54296875, "learning_rate": 9.76687057592706e-07, "loss": -0.0, "reward": 1.9956495761871338, "reward_std": 0.08419714123010635, "rewards/accuracy_reward": 0.38627466559410095, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1418, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 425.609375, "epoch": 0.09765329295987887, "grad_norm": 0.704076515072999, "kl": 0.6875, "learning_rate": 9.766544229463066e-07, "loss": 0.0, "reward": 2.296875, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.25, "step": 1419, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 433.453125, "epoch": 0.09772211134815223, "grad_norm": 0.8628556578481013, "kl": 0.66015625, "learning_rate": 9.766217660200525e-07, "loss": -0.0, "reward": 2.0192174911499023, "reward_std": 0.05931995436549187, "rewards/accuracy_reward": 0.8285923004150391, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1420, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 440.828125, "epoch": 0.09779092973642557, "grad_norm": 0.43775792182975626, "kl": 0.68359375, "learning_rate": 9.765890868154692e-07, "loss": 0.0, "reward": 2.1448800563812256, "reward_std": 0.06878858804702759, "rewards/accuracy_reward": 0.5386300683021545, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 1421, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 428.6875, "epoch": 0.09785974812469891, "grad_norm": 0.5671563556311464, "kl": 0.70703125, "learning_rate": 9.765563853340846e-07, "loss": -0.0, "reward": 1.8627008199691772, "reward_std": 0.005865235347300768, "rewards/accuracy_reward": 0.6877007484436035, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1422, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 375.4375, "epoch": 0.09792856651297227, "grad_norm": 0.6554491408614632, "kl": 0.6875, "learning_rate": 9.765236615774273e-07, "loss": 0.0, "reward": 2.046875, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1423, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 405.671875, "epoch": 0.09799738490124561, "grad_norm": 0.8408489166036831, "kl": 0.70703125, "learning_rate": 9.764909155470269e-07, "loss": 0.0, "reward": 2.0718750953674316, "reward_std": 0.08901721239089966, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.25, "step": 1424, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 396.015625, "epoch": 0.09806620328951896, "grad_norm": 1.6049908960772292, "kl": 0.71875, "learning_rate": 9.764581472444137e-07, "loss": 0.0, "reward": 2.533813714981079, "reward_std": 0.02075936645269394, "rewards/accuracy_reward": 0.8431888818740845, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1425, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 421.578125, "epoch": 0.09813502167779231, "grad_norm": 0.6541475531109799, "kl": 0.73046875, "learning_rate": 9.764253566711196e-07, "loss": 0.0, "reward": 1.7578125, "reward_std": 0.022097086533904076, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0078125, "step": 1426, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 381.75, "epoch": 0.09820384006606565, "grad_norm": 1.2959428938094448, "kl": 0.6875, "learning_rate": 9.763925438286776e-07, "loss": 0.0, "reward": 2.043750047683716, "reward_std": 0.017677675932645798, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1427, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 421.890625, "epoch": 0.098272658454339, "grad_norm": 1.251322093316814, "kl": 0.75, "learning_rate": 9.763597087186207e-07, "loss": -0.0, "reward": 1.8250000476837158, "reward_std": 0.08017837256193161, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1428, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 392.703125, "epoch": 0.09834147684261235, "grad_norm": 1.0226041441223244, "kl": 0.59765625, "learning_rate": 9.763268513424842e-07, "loss": 0.0, "reward": 2.0374999046325684, "reward_std": 0.029250433668494225, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1429, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 410.21875, "epoch": 0.0984102952308857, "grad_norm": 0.777321672104695, "kl": 0.58984375, "learning_rate": 9.76293971701804e-07, "loss": -0.0, "reward": 2.606250047683716, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1430, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 408.75, "epoch": 0.09847911361915904, "grad_norm": 0.7628332484390152, "kl": 0.65234375, "learning_rate": 9.762610697981166e-07, "loss": 0.0, "reward": 2.106250047683716, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1431, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 433.5, "epoch": 0.09854793200743239, "grad_norm": 0.0, "kl": 0.55078125, "learning_rate": 9.762281456329604e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1432, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 424.296875, "epoch": 0.09861675039570574, "grad_norm": 0.0, "kl": 0.62890625, "learning_rate": 9.761951992078737e-07, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1433, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 457.59375, "epoch": 0.09868556878397908, "grad_norm": 0.0, "kl": 0.59375, "learning_rate": 9.761622305243971e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1434, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 468.875, "epoch": 0.09875438717225242, "grad_norm": 1.2465452241159187, "kl": 0.52734375, "learning_rate": 9.76129239584071e-07, "loss": -0.0, "reward": 1.9732441902160645, "reward_std": 0.0881616547703743, "rewards/accuracy_reward": 0.7857441306114197, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1435, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 470.71875, "epoch": 0.09882320556052578, "grad_norm": 1.0403656483818093, "kl": 0.52734375, "learning_rate": 9.760962263884382e-07, "loss": 0.0, "reward": 1.9069736003875732, "reward_std": 0.12806689739227295, "rewards/accuracy_reward": 0.7444735765457153, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1436, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 463.1875, "epoch": 0.09889202394879912, "grad_norm": 3.038530980613955, "kl": 0.71875, "learning_rate": 9.760631909390414e-07, "loss": -0.0, "reward": 2.1500000953674316, "reward_std": 0.12730026245117188, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0625, "step": 1437, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 457.109375, "epoch": 0.09896084233707246, "grad_norm": 0.6109306819019124, "kl": 0.74609375, "learning_rate": 9.760301332374246e-07, "loss": 0.0, "reward": 2.53125, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1438, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 502.0625, "epoch": 0.09902966072534582, "grad_norm": 0.6710534495089949, "kl": 0.53125, "learning_rate": 9.759970532851332e-07, "loss": 0.0, "reward": 2.6437501907348633, "reward_std": 0.12246952950954437, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1439, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 464.671875, "epoch": 0.09909847911361916, "grad_norm": 0.0, "kl": 0.62890625, "learning_rate": 9.759639510837134e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1440, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 475.984375, "epoch": 0.0991672975018925, "grad_norm": 1.0693694967920473, "kl": 0.6171875, "learning_rate": 9.759308266347126e-07, "loss": -0.0, "reward": 1.9375, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1441, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 519.28125, "epoch": 0.09923611589016586, "grad_norm": 1.3556488138813279, "kl": 0.67578125, "learning_rate": 9.75897679939679e-07, "loss": -0.0, "reward": 1.7796977758407593, "reward_std": 0.0797707587480545, "rewards/accuracy_reward": 0.6234477758407593, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1442, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 459.96875, "epoch": 0.0993049342784392, "grad_norm": 0.8362511793248774, "kl": 0.609375, "learning_rate": 9.758645110001617e-07, "loss": 0.0, "reward": 1.6750000715255737, "reward_std": 0.13887301087379456, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1443, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 487.546875, "epoch": 0.09937375266671254, "grad_norm": 0.9295861792007957, "kl": 0.71484375, "learning_rate": 9.758313198177115e-07, "loss": 0.0, "reward": 2.2453126907348633, "reward_std": 0.022097086533904076, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1953125, "step": 1444, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 480.203125, "epoch": 0.0994425710549859, "grad_norm": 1.2145498894915479, "kl": 0.5390625, "learning_rate": 9.757981063938796e-07, "loss": -0.0, "reward": 2.4156250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1445, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 522.09375, "epoch": 0.09951138944325924, "grad_norm": 1.1147936540692598, "kl": 0.67578125, "learning_rate": 9.757648707302185e-07, "loss": 0.0, "reward": 2.274580955505371, "reward_std": 0.10319501161575317, "rewards/accuracy_reward": 0.7808308005332947, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.375, "step": 1446, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 466.515625, "epoch": 0.09958020783153258, "grad_norm": 0.0, "kl": 0.6171875, "learning_rate": 9.757316128282817e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1447, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 477.359375, "epoch": 0.09964902621980594, "grad_norm": 0.0, "kl": 0.71484375, "learning_rate": 9.756983326896239e-07, "loss": 0.0, "reward": 2.2750000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1448, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 482.546875, "epoch": 0.09971784460807928, "grad_norm": 0.39001308577751226, "kl": 0.59375, "learning_rate": 9.756650303158004e-07, "loss": -0.0, "reward": 1.977135419845581, "reward_std": 0.0026898395735770464, "rewards/accuracy_reward": 0.8021353483200073, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1449, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 482.015625, "epoch": 0.09978666299635262, "grad_norm": 1.3810103858801146, "kl": 0.53515625, "learning_rate": 9.756317057083679e-07, "loss": 0.0, "reward": 2.359332323074341, "reward_std": 0.1266455501317978, "rewards/accuracy_reward": 0.7062071561813354, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1450, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.5, "completion_length": 472.203125, "epoch": 0.09985548138462597, "grad_norm": 0.0, "kl": 0.58984375, "learning_rate": 9.755983588688843e-07, "loss": 0.0, "reward": 1.600000023841858, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1451, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 502.15625, "epoch": 0.09992429977289932, "grad_norm": 0.5625510852239798, "kl": 0.73046875, "learning_rate": 9.75564989798908e-07, "loss": -0.0, "reward": 2.2874999046325684, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1452, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 485.828125, "epoch": 0.09999311816117266, "grad_norm": 0.7348585792983925, "kl": 0.578125, "learning_rate": 9.75531598499999e-07, "loss": -0.0, "reward": 1.938725471496582, "reward_std": 0.07290271669626236, "rewards/accuracy_reward": 0.782475471496582, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1453, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 485.03125, "epoch": 0.100061936549446, "grad_norm": 0.5374531566230102, "kl": 0.58984375, "learning_rate": 9.754981849737177e-07, "loss": -0.0, "reward": 1.9375, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1454, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 469.46875, "epoch": 0.10013075493771936, "grad_norm": 2.4273686029521575, "kl": 0.578125, "learning_rate": 9.754647492216265e-07, "loss": -0.0, "reward": 1.8250000476837158, "reward_std": 0.13887301087379456, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1455, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 507.390625, "epoch": 0.1001995733259927, "grad_norm": 1.3205828892550364, "kl": 0.4765625, "learning_rate": 9.754312912452877e-07, "loss": 0.0, "reward": 2.4372212886810303, "reward_std": 0.13946714997291565, "rewards/accuracy_reward": 0.7590962648391724, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1456, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 527.53125, "epoch": 0.10026839171426605, "grad_norm": 0.5194442199557788, "kl": 0.478515625, "learning_rate": 9.753978110462656e-07, "loss": -0.0, "reward": 2.0814595222473145, "reward_std": 0.07338720560073853, "rewards/accuracy_reward": 0.8877096176147461, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1457, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 538.0, "epoch": 0.1003372101025394, "grad_norm": 0.6295062264226643, "kl": 0.486328125, "learning_rate": 9.75364308626125e-07, "loss": -0.0, "reward": 1.8596484661102295, "reward_std": 0.09990574419498444, "rewards/accuracy_reward": 0.7221482992172241, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 1458, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 488.8125, "epoch": 0.10040602849081275, "grad_norm": 1.163058018977142, "kl": 0.5546875, "learning_rate": 9.753307839864317e-07, "loss": -0.0, "reward": 1.9937500953674316, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1459, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 513.4375, "epoch": 0.10047484687908609, "grad_norm": 0.0, "kl": 0.498046875, "learning_rate": 9.75297237128753e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1460, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 542.421875, "epoch": 0.10054366526735944, "grad_norm": 1.1832470711496645, "kl": 0.63671875, "learning_rate": 9.752636680546568e-07, "loss": -0.0, "reward": 2.376681327819824, "reward_std": 0.036097731441259384, "rewards/accuracy_reward": 0.7954312562942505, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.40625, "step": 1461, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 483.046875, "epoch": 0.10061248365563279, "grad_norm": 1.1165821129278748, "kl": 0.66796875, "learning_rate": 9.752300767657121e-07, "loss": -0.0, "reward": 2.3249998092651367, "reward_std": 0.13887301087379456, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1462, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 505.734375, "epoch": 0.10068130204390613, "grad_norm": 0.5333794974412591, "kl": 0.62109375, "learning_rate": 9.751964632634893e-07, "loss": 0.0, "reward": 2.6307239532470703, "reward_std": 0.002202628180384636, "rewards/accuracy_reward": 0.930724024772644, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1463, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 569.5, "epoch": 0.10075012043217949, "grad_norm": 2.401896265568457, "kl": 0.5859375, "learning_rate": 9.751628275495591e-07, "loss": -0.0, "reward": 1.6089978218078613, "reward_std": 0.2378087341785431, "rewards/accuracy_reward": 0.48712271451950073, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 1464, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 547.46875, "epoch": 0.10081893882045283, "grad_norm": 0.7280604413958499, "kl": 0.48046875, "learning_rate": 9.751291696254945e-07, "loss": -0.0, "reward": 2.3511736392974854, "reward_std": 0.11903824657201767, "rewards/accuracy_reward": 0.7152361273765564, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1465, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 508.65625, "epoch": 0.10088775720872617, "grad_norm": 1.2370851767794324, "kl": 0.65625, "learning_rate": 9.750954894928678e-07, "loss": -0.0, "reward": 2.1796875, "reward_std": 0.0765409916639328, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4140625, "step": 1466, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 501.5, "epoch": 0.10095657559699951, "grad_norm": 0.7874903760881754, "kl": 0.6484375, "learning_rate": 9.75061787153254e-07, "loss": 0.0, "reward": 2.03125, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1467, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 485.25, "epoch": 0.10102539398527287, "grad_norm": 0.6572973902791875, "kl": 0.4921875, "learning_rate": 9.75028062608228e-07, "loss": 0.0, "reward": 2.4513473510742188, "reward_std": 0.05611639842391014, "rewards/accuracy_reward": 0.779472291469574, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1468, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 478.15625, "epoch": 0.10109421237354621, "grad_norm": 0.0, "kl": 0.6640625, "learning_rate": 9.749943158593664e-07, "loss": 0.0, "reward": 2.4875001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1469, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 490.828125, "epoch": 0.10116303076181955, "grad_norm": 0.7080121033887112, "kl": 0.66015625, "learning_rate": 9.749605469082465e-07, "loss": -0.0, "reward": 2.5250000953674316, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1470, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 486.421875, "epoch": 0.10123184915009291, "grad_norm": 0.40856969117052827, "kl": 0.52734375, "learning_rate": 9.749267557564467e-07, "loss": 0.0, "reward": 1.6717029809951782, "reward_std": 0.009022252634167671, "rewards/accuracy_reward": 0.5467029809951782, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1471, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 499.640625, "epoch": 0.10130066753836625, "grad_norm": 1.2948042338449428, "kl": 0.470703125, "learning_rate": 9.748929424055465e-07, "loss": 0.0, "reward": 2.4515624046325684, "reward_std": 0.1773054450750351, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1472, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 486.5625, "epoch": 0.1013694859266396, "grad_norm": 1.864900231736988, "kl": 0.55078125, "learning_rate": 9.748591068571265e-07, "loss": 0.0, "reward": 2.4562501907348633, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1473, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 494.875, "epoch": 0.10143830431491295, "grad_norm": 0.0, "kl": 0.48046875, "learning_rate": 9.748252491127682e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1474, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 479.03125, "epoch": 0.10150712270318629, "grad_norm": 0.0, "kl": 0.5546875, "learning_rate": 9.747913691740542e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1475, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 446.890625, "epoch": 0.10157594109145963, "grad_norm": 0.0, "kl": 0.71875, "learning_rate": 9.747574670425678e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1476, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 461.015625, "epoch": 0.10164475947973299, "grad_norm": 1.2244157260491975, "kl": 0.486328125, "learning_rate": 9.747235427198942e-07, "loss": 0.0, "reward": 2.0850892066955566, "reward_std": 0.06330633908510208, "rewards/accuracy_reward": 0.8882143497467041, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1477, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 495.078125, "epoch": 0.10171357786800633, "grad_norm": 1.4117465430014784, "kl": 0.55859375, "learning_rate": 9.746895962076187e-07, "loss": 0.0, "reward": 1.931146502494812, "reward_std": 0.07263214886188507, "rewards/accuracy_reward": 0.7623964548110962, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1478, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 469.171875, "epoch": 0.10178239625627968, "grad_norm": 3.5248015083600426, "kl": 0.5, "learning_rate": 9.746556275073283e-07, "loss": 0.0, "reward": 2.6624999046325684, "reward_std": 0.06943651288747787, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1479, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 458.53125, "epoch": 0.10185121464455303, "grad_norm": 0.0, "kl": 0.6796875, "learning_rate": 9.746216366206105e-07, "loss": 0.0, "reward": 2.4875001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1480, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 471.859375, "epoch": 0.10192003303282637, "grad_norm": 0.8259838212563012, "kl": 0.66015625, "learning_rate": 9.745876235490542e-07, "loss": -0.0, "reward": 2.3467845916748047, "reward_std": 0.07457801699638367, "rewards/accuracy_reward": 0.737409770488739, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1481, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 478.84375, "epoch": 0.10198885142109972, "grad_norm": 2.7843403039975243, "kl": 0.640625, "learning_rate": 9.745535882942494e-07, "loss": -0.0, "reward": 2.5875000953674316, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1482, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 465.875, "epoch": 0.10205766980937306, "grad_norm": 1.9439891712474846, "kl": 0.65625, "learning_rate": 9.74519530857787e-07, "loss": -0.0, "reward": 1.8437501192092896, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1483, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 459.765625, "epoch": 0.10212648819764641, "grad_norm": 0.0, "kl": 0.51171875, "learning_rate": 9.744854512412585e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1484, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 463.046875, "epoch": 0.10219530658591976, "grad_norm": 0.0, "kl": 0.5546875, "learning_rate": 9.744513494462574e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1485, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 454.828125, "epoch": 0.1022641249741931, "grad_norm": 0.4072553144740861, "kl": 0.5703125, "learning_rate": 9.744172254743773e-07, "loss": -0.0, "reward": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1486, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 464.09375, "epoch": 0.10233294336246646, "grad_norm": 0.5689333156338465, "kl": 0.66796875, "learning_rate": 9.743830793272131e-07, "loss": 0.0, "reward": 2.2608165740966797, "reward_std": 0.004497719928622246, "rewards/accuracy_reward": 0.673316478729248, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1487, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 466.125, "epoch": 0.1024017617507398, "grad_norm": 0.9573302858687317, "kl": 0.69140625, "learning_rate": 9.743489110063614e-07, "loss": 0.0, "reward": 1.751753807067871, "reward_std": 0.08739590644836426, "rewards/accuracy_reward": 0.6080037355422974, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1488, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 444.0625, "epoch": 0.10247058013901314, "grad_norm": 0.617721964384211, "kl": 0.6796875, "learning_rate": 9.74314720513419e-07, "loss": -0.0, "reward": 2.628476619720459, "reward_std": 0.006536333821713924, "rewards/accuracy_reward": 0.9284766912460327, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1489, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 446.78125, "epoch": 0.1025393985272865, "grad_norm": 2.3513859708941314, "kl": 0.53125, "learning_rate": 9.74280507849984e-07, "loss": 0.0, "reward": 2.2540199756622314, "reward_std": 0.0171592365950346, "rewards/accuracy_reward": 0.6040199398994446, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1490, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 450.703125, "epoch": 0.10260821691555984, "grad_norm": 0.47804932924570853, "kl": 0.51953125, "learning_rate": 9.742462730176557e-07, "loss": -0.0, "reward": 2.4656577110290527, "reward_std": 0.0035714423283934593, "rewards/accuracy_reward": 0.7906577587127686, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1491, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 450.046875, "epoch": 0.10267703530383318, "grad_norm": 0.0, "kl": 0.55859375, "learning_rate": 9.742120160180343e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1492, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 433.21875, "epoch": 0.10274585369210654, "grad_norm": 0.9631739637590777, "kl": 0.52734375, "learning_rate": 9.74177736852721e-07, "loss": -0.0, "reward": 2.551544189453125, "reward_std": 0.07933984696865082, "rewards/accuracy_reward": 0.8609189391136169, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1493, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 450.546875, "epoch": 0.10281467208037988, "grad_norm": 0.5122948610514657, "kl": 0.5234375, "learning_rate": 9.74143435523318e-07, "loss": 0.0, "reward": 1.8218743801116943, "reward_std": 0.003607348073273897, "rewards/accuracy_reward": 0.6718742847442627, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1494, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 479.390625, "epoch": 0.10288349046865322, "grad_norm": 0.0, "kl": 0.6953125, "learning_rate": 9.741091120314287e-07, "loss": 0.0, "reward": 2.4875001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1495, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 477.109375, "epoch": 0.10295230885692658, "grad_norm": 0.0, "kl": 0.71484375, "learning_rate": 9.740747663786575e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1496, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 426.375, "epoch": 0.10302112724519992, "grad_norm": 1.0095544629131177, "kl": 0.52734375, "learning_rate": 9.740403985666095e-07, "loss": -0.0, "reward": 2.5500001907348633, "reward_std": 0.16035676002502441, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1497, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 481.015625, "epoch": 0.10308994563347326, "grad_norm": 0.5015391958825807, "kl": 0.67578125, "learning_rate": 9.740060085968915e-07, "loss": 0.0, "reward": 2.622899055480957, "reward_std": 0.0035123119596391916, "rewards/accuracy_reward": 0.9228988885879517, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1498, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 441.703125, "epoch": 0.1031587640217466, "grad_norm": 0.5209868618106454, "kl": 0.5859375, "learning_rate": 9.739715964711108e-07, "loss": -0.0, "reward": 1.9375, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1499, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 419.453125, "epoch": 0.10322758241001996, "grad_norm": 2.229887694446982, "kl": 0.53125, "learning_rate": 9.739371621908761e-07, "loss": -0.0, "reward": 2.3015241622924805, "reward_std": 0.03049800917506218, "rewards/accuracy_reward": 0.6608990430831909, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1500, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 449.3125, "epoch": 0.1032964007982933, "grad_norm": 0.0, "kl": 0.55859375, "learning_rate": 9.739027057577966e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1501, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 465.265625, "epoch": 0.10336521918656665, "grad_norm": 0.5804156787402445, "kl": 0.66015625, "learning_rate": 9.73868227173483e-07, "loss": -0.0, "reward": 2.2874999046325684, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1502, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 475.5, "epoch": 0.10343403757484, "grad_norm": 2.4890748115100063, "kl": 0.5078125, "learning_rate": 9.73833726439547e-07, "loss": -0.0, "reward": 2.326038360595703, "reward_std": 0.005589126609265804, "rewards/accuracy_reward": 0.6760381460189819, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1503, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 466.5, "epoch": 0.10350285596311334, "grad_norm": 0.0, "kl": 0.6875, "learning_rate": 9.737992035576012e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1504, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 435.078125, "epoch": 0.10357167435138669, "grad_norm": 1.796096750542402, "kl": 0.57421875, "learning_rate": 9.737646585292592e-07, "loss": 0.0, "reward": 2.0459465980529785, "reward_std": 0.16058960556983948, "rewards/accuracy_reward": 0.861571729183197, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1505, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 451.453125, "epoch": 0.10364049273966004, "grad_norm": 0.0, "kl": 0.578125, "learning_rate": 9.737300913561357e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1506, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 464.328125, "epoch": 0.10370931112793338, "grad_norm": 0.543769747987551, "kl": 0.5546875, "learning_rate": 9.736955020398464e-07, "loss": 0.0, "reward": 2.6437501907348633, "reward_std": 0.07763238251209259, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1507, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 439.078125, "epoch": 0.10377812951620673, "grad_norm": 0.47696123381909317, "kl": 0.5703125, "learning_rate": 9.736608905820083e-07, "loss": -0.0, "reward": 1.963568925857544, "reward_std": 0.00396353704854846, "rewards/accuracy_reward": 0.7885687947273254, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1508, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 450.46875, "epoch": 0.10384694790448008, "grad_norm": 0.8530927009770137, "kl": 0.52734375, "learning_rate": 9.73626256984239e-07, "loss": 0.0, "reward": 2.4802606105804443, "reward_std": 0.024477660655975342, "rewards/accuracy_reward": 0.8083856701850891, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1509, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 437.3125, "epoch": 0.10391576629275343, "grad_norm": 0.4730830580253727, "kl": 0.51953125, "learning_rate": 9.735916012481576e-07, "loss": 0.0, "reward": 2.1226534843444824, "reward_std": 0.0033116491977125406, "rewards/accuracy_reward": 0.9226534366607666, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1510, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 480.09375, "epoch": 0.10398458468102677, "grad_norm": 0.8695492185816092, "kl": 0.58203125, "learning_rate": 9.735569233753836e-07, "loss": -0.0, "reward": 2.3168773651123047, "reward_std": 0.0042109740898013115, "rewards/accuracy_reward": 0.6668775081634521, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1511, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 428.78125, "epoch": 0.10405340306930012, "grad_norm": 1.3197442750928872, "kl": 0.6640625, "learning_rate": 9.73522223367538e-07, "loss": 0.0, "reward": 1.8884594440460205, "reward_std": 0.08508006483316422, "rewards/accuracy_reward": 0.7290843725204468, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1512, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 430.4375, "epoch": 0.10412222145757347, "grad_norm": 0.5028178286055984, "kl": 0.578125, "learning_rate": 9.734875012262433e-07, "loss": 0.0, "reward": 1.8065105676651, "reward_std": 0.03080461174249649, "rewards/accuracy_reward": 0.6596355438232422, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1513, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 443.734375, "epoch": 0.10419103984584681, "grad_norm": 0.0, "kl": 0.65234375, "learning_rate": 9.73452756953122e-07, "loss": 0.0, "reward": 2.4875001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1514, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 447.65625, "epoch": 0.10425985823412015, "grad_norm": 3.595859866548783, "kl": 0.63671875, "learning_rate": 9.734179905497982e-07, "loss": 0.0, "reward": 2.4820098876953125, "reward_std": 0.0017207106575369835, "rewards/accuracy_reward": 0.8070098161697388, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1515, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 428.796875, "epoch": 0.1043286766223935, "grad_norm": 0.0, "kl": 0.578125, "learning_rate": 9.733832020178967e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1516, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 455.390625, "epoch": 0.10439749501066685, "grad_norm": 0.8076161731235101, "kl": 0.5703125, "learning_rate": 9.733483913590439e-07, "loss": 0.0, "reward": 1.8199355602264404, "reward_std": 0.003673957660794258, "rewards/accuracy_reward": 0.6699354648590088, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1517, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 437.828125, "epoch": 0.10446631339894019, "grad_norm": 0.0, "kl": 0.56640625, "learning_rate": 9.73313558574867e-07, "loss": 0.0, "reward": 2.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1518, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 438.265625, "epoch": 0.10453513178721355, "grad_norm": 0.8490732955542513, "kl": 0.625, "learning_rate": 9.73278703666994e-07, "loss": 0.0, "reward": 2.359375, "reward_std": 0.165825754404068, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.328125, "step": 1519, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 456.765625, "epoch": 0.10460395017548689, "grad_norm": 0.6406690053020312, "kl": 0.6171875, "learning_rate": 9.732438266370541e-07, "loss": 0.0, "reward": 2.4563581943511963, "reward_std": 0.07239029556512833, "rewards/accuracy_reward": 0.8876080513000488, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1520, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 435.734375, "epoch": 0.10467276856376023, "grad_norm": 0.3676641158349571, "kl": 0.5625, "learning_rate": 9.732089274866775e-07, "loss": 0.0, "reward": 1.9156250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1521, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 445.34375, "epoch": 0.10474158695203359, "grad_norm": 1.6957559339065238, "kl": 0.6328125, "learning_rate": 9.731740062174956e-07, "loss": 0.0, "reward": 2.2546873092651367, "reward_std": 0.18782177567481995, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4296875, "step": 1522, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 436.140625, "epoch": 0.10481040534030693, "grad_norm": 1.0558943135789443, "kl": 0.57421875, "learning_rate": 9.731390628311406e-07, "loss": 0.0, "reward": 2.015625, "reward_std": 0.09722718596458435, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1523, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 423.296875, "epoch": 0.10487922372858027, "grad_norm": 0.5321100096763216, "kl": 0.6796875, "learning_rate": 9.731040973292457e-07, "loss": -0.0, "reward": 2.0002050399780273, "reward_std": 0.015679433941841125, "rewards/accuracy_reward": 0.8252050280570984, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1524, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 432.609375, "epoch": 0.10494804211685363, "grad_norm": 0.0, "kl": 0.5546875, "learning_rate": 9.730691097134456e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1525, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 435.578125, "epoch": 0.10501686050512697, "grad_norm": 0.0, "kl": 0.51171875, "learning_rate": 9.730340999853754e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1526, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 424.171875, "epoch": 0.10508567889340031, "grad_norm": 1.2460904828761714, "kl": 0.69140625, "learning_rate": 9.729990681466716e-07, "loss": -0.0, "reward": 2.5875000953674316, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1527, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 428.421875, "epoch": 0.10515449728167367, "grad_norm": 0.999809501365038, "kl": 0.578125, "learning_rate": 9.729640141989716e-07, "loss": 0.0, "reward": 1.915624976158142, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1528, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 423.84375, "epoch": 0.10522331566994701, "grad_norm": 0.0, "kl": 0.58203125, "learning_rate": 9.729289381439143e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1529, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 410.75, "epoch": 0.10529213405822035, "grad_norm": 1.8080557478278239, "kl": 0.53125, "learning_rate": 9.728938399831385e-07, "loss": 0.0, "reward": 2.159374952316284, "reward_std": 0.07827534526586533, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1530, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 449.125, "epoch": 0.1053609524464937, "grad_norm": 0.9544548592506112, "kl": 0.63671875, "learning_rate": 9.728587197182855e-07, "loss": 0.0, "reward": 2.5520901679992676, "reward_std": 0.056797824800014496, "rewards/accuracy_reward": 0.9177151918411255, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1531, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 458.9375, "epoch": 0.10542977083476705, "grad_norm": 1.1911842866696454, "kl": 0.515625, "learning_rate": 9.728235773509966e-07, "loss": -0.0, "reward": 2.0674514770507812, "reward_std": 0.007731719873845577, "rewards/accuracy_reward": 0.8674513101577759, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1532, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 446.5625, "epoch": 0.1054985892230404, "grad_norm": 0.5280101322033633, "kl": 0.5234375, "learning_rate": 9.727884128829142e-07, "loss": 0.0, "reward": 2.512500047683716, "reward_std": 0.06943651288747787, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1533, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 426.875, "epoch": 0.10556740761131374, "grad_norm": 0.5444214797642188, "kl": 0.69921875, "learning_rate": 9.727532263156822e-07, "loss": -0.0, "reward": 2.4727301597595215, "reward_std": 0.0034469382371753454, "rewards/accuracy_reward": 0.797730028629303, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1534, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 442.375, "epoch": 0.1056362259995871, "grad_norm": 2.271711999071627, "kl": 0.6953125, "learning_rate": 9.727180176509453e-07, "loss": 0.0, "reward": 2.484375, "reward_std": 0.09722718596458435, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1535, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 446.6875, "epoch": 0.10570504438786044, "grad_norm": 1.4208042233578762, "kl": 0.64453125, "learning_rate": 9.726827868903494e-07, "loss": 0.0, "reward": 2.4728896617889404, "reward_std": 0.025045113638043404, "rewards/accuracy_reward": 0.8057020902633667, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4921875, "step": 1536, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 453.796875, "epoch": 0.10577386277613378, "grad_norm": 0.0, "kl": 0.5546875, "learning_rate": 9.726475340355407e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1537, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 437.0625, "epoch": 0.10584268116440713, "grad_norm": 0.9424304629980699, "kl": 0.69921875, "learning_rate": 9.726122590881676e-07, "loss": 0.0, "reward": 2.3312501907348633, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1538, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 430.125, "epoch": 0.10591149955268048, "grad_norm": 0.0, "kl": 0.5703125, "learning_rate": 9.725769620498785e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1539, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 412.015625, "epoch": 0.10598031794095382, "grad_norm": 1.132834700638678, "kl": 0.7109375, "learning_rate": 9.725416429223237e-07, "loss": -0.0, "reward": 1.9681127071380615, "reward_std": 0.00597049156203866, "rewards/accuracy_reward": 0.7931126356124878, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1540, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 447.9375, "epoch": 0.10604913632922718, "grad_norm": 0.0, "kl": 0.57421875, "learning_rate": 9.725063017071536e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1541, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 453.15625, "epoch": 0.10611795471750052, "grad_norm": 0.755659705166072, "kl": 0.51953125, "learning_rate": 9.724709384060204e-07, "loss": -0.0, "reward": 2.34375, "reward_std": 0.07763238251209259, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1542, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 441.1875, "epoch": 0.10618677310577386, "grad_norm": 0.0, "kl": 0.5546875, "learning_rate": 9.72435553020577e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1543, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 453.34375, "epoch": 0.10625559149404722, "grad_norm": 0.36669109502897, "kl": 0.53515625, "learning_rate": 9.724001455524774e-07, "loss": 0.0, "reward": 2.476041555404663, "reward_std": 0.0041269585490226746, "rewards/accuracy_reward": 0.8010414838790894, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1544, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 424.625, "epoch": 0.10632440988232056, "grad_norm": 0.8449512478948251, "kl": 0.5546875, "learning_rate": 9.723647160033768e-07, "loss": -0.0, "reward": 2.1123924255371094, "reward_std": 0.0035963058471679688, "rewards/accuracy_reward": 0.912392258644104, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1545, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 418.515625, "epoch": 0.1063932282705939, "grad_norm": 1.3287210228528885, "kl": 0.5546875, "learning_rate": 9.72329264374931e-07, "loss": -0.0, "reward": 2.5875000953674316, "reward_std": 0.14961488544940948, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1546, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.4375, "epoch": 0.10646204665886724, "grad_norm": 3.2585633899206443, "kl": 0.58203125, "learning_rate": 9.72293790668797e-07, "loss": 0.0, "reward": 2.125, "reward_std": 0.1306653916835785, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1547, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 432.234375, "epoch": 0.1065308650471406, "grad_norm": 0.0, "kl": 0.5546875, "learning_rate": 9.722582948866331e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1548, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 409.84375, "epoch": 0.10659968343541394, "grad_norm": 3.368936131601648, "kl": 0.68359375, "learning_rate": 9.722227770300986e-07, "loss": 0.0, "reward": 2.315624952316284, "reward_std": 0.03718617185950279, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.421875, "step": 1549, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 437.859375, "epoch": 0.10666850182368728, "grad_norm": 0.6665561268866269, "kl": 0.578125, "learning_rate": 9.721872371008534e-07, "loss": -0.0, "reward": 1.9937500953674316, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1550, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 407.5, "epoch": 0.10673732021196064, "grad_norm": 1.335132182125381, "kl": 0.6484375, "learning_rate": 9.721516751005588e-07, "loss": 0.0, "reward": 2.3704280853271484, "reward_std": 0.08183832466602325, "rewards/accuracy_reward": 0.7173030376434326, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1551, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 421.953125, "epoch": 0.10680613860023398, "grad_norm": 1.6798638453267065, "kl": 0.5546875, "learning_rate": 9.721160910308773e-07, "loss": -0.0, "reward": 2.3039021492004395, "reward_std": 0.09743967652320862, "rewards/accuracy_reward": 0.6476520895957947, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1552, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 451.40625, "epoch": 0.10687495698850732, "grad_norm": 2.1681145908399424, "kl": 0.53515625, "learning_rate": 9.720804848934716e-07, "loss": -0.0, "reward": 2.041675329208374, "reward_std": 0.14129869639873505, "rewards/accuracy_reward": 0.8541752099990845, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1553, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 453.484375, "epoch": 0.10694377537678068, "grad_norm": 0.0, "kl": 0.546875, "learning_rate": 9.720448566900064e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1554, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 413.609375, "epoch": 0.10701259376505402, "grad_norm": 0.5517557590973702, "kl": 0.69921875, "learning_rate": 9.720092064221472e-07, "loss": 0.0, "reward": 2.546875, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1555, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 456.203125, "epoch": 0.10708141215332737, "grad_norm": 1.084695115004045, "kl": 0.55859375, "learning_rate": 9.7197353409156e-07, "loss": 0.0, "reward": 2.046875, "reward_std": 0.09722718596458435, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1556, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 452.390625, "epoch": 0.10715023054160072, "grad_norm": 0.0, "kl": 0.71875, "learning_rate": 9.719378396999125e-07, "loss": 0.0, "reward": 2.0875000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1875, "step": 1557, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 461.046875, "epoch": 0.10721904892987406, "grad_norm": 0.6502495106782055, "kl": 0.67578125, "learning_rate": 9.719021232488728e-07, "loss": -0.0, "reward": 2.2874999046325684, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1558, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 467.15625, "epoch": 0.1072878673181474, "grad_norm": 0.5338162067713459, "kl": 0.5546875, "learning_rate": 9.718663847401107e-07, "loss": 0.0, "reward": 2.1812500953674316, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1559, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 478.078125, "epoch": 0.10735668570642076, "grad_norm": 1.5850152315717383, "kl": 0.52734375, "learning_rate": 9.718306241752965e-07, "loss": 0.0, "reward": 1.828350305557251, "reward_std": 0.004452967084944248, "rewards/accuracy_reward": 0.6783503293991089, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1560, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 474.015625, "epoch": 0.1074255040946941, "grad_norm": 0.557727352550569, "kl": 0.5546875, "learning_rate": 9.717948415561019e-07, "loss": -0.0, "reward": 2.065624952316284, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1561, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 480.203125, "epoch": 0.10749432248296745, "grad_norm": 0.9782152680581673, "kl": 0.5390625, "learning_rate": 9.717590368841993e-07, "loss": -0.0, "reward": 2.419693946838379, "reward_std": 0.014404318295419216, "rewards/accuracy_reward": 0.7446936964988708, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1562, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 467.328125, "epoch": 0.10756314087124079, "grad_norm": 0.7647674819842365, "kl": 0.5390625, "learning_rate": 9.717232101612622e-07, "loss": -0.0, "reward": 2.0875000953674316, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1563, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 473.578125, "epoch": 0.10763195925951415, "grad_norm": 2.407499809626982, "kl": 0.52734375, "learning_rate": 9.716873613889653e-07, "loss": -0.0, "reward": 2.631366729736328, "reward_std": 0.002907406771555543, "rewards/accuracy_reward": 0.9313665628433228, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1564, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 494.9375, "epoch": 0.10770077764778749, "grad_norm": 1.5593439606380868, "kl": 0.515625, "learning_rate": 9.716514905689844e-07, "loss": -0.0, "reward": 2.216214179992676, "reward_std": 0.08965340256690979, "rewards/accuracy_reward": 0.5537139773368835, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1565, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 481.84375, "epoch": 0.10776959603606083, "grad_norm": 0.838297554017967, "kl": 0.546875, "learning_rate": 9.716155977029961e-07, "loss": 0.0, "reward": 2.5656251907348633, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1566, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 477.015625, "epoch": 0.10783841442433419, "grad_norm": 0.0, "kl": 0.5234375, "learning_rate": 9.715796827926781e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1567, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 467.78125, "epoch": 0.10790723281260753, "grad_norm": 0.0, "kl": 0.5546875, "learning_rate": 9.71543745839709e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1568, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.0, "completion_length": 472.5, "epoch": 0.10797605120088087, "grad_norm": 1.111661148431845, "kl": 0.546875, "learning_rate": 9.715077868457688e-07, "loss": -0.0, "reward": 2.096318244934082, "reward_std": 0.057083748281002045, "rewards/accuracy_reward": 0.4713183641433716, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1569, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 471.859375, "epoch": 0.10804486958915423, "grad_norm": 0.0, "kl": 0.7109375, "learning_rate": 9.71471805812538e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1570, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 469.171875, "epoch": 0.10811368797742757, "grad_norm": 0.7112180457844078, "kl": 0.71484375, "learning_rate": 9.714358027416989e-07, "loss": 0.0, "reward": 2.0187501907348633, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0625, "step": 1571, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 478.0, "epoch": 0.10818250636570091, "grad_norm": 1.3880671074051876, "kl": 0.54296875, "learning_rate": 9.713997776349338e-07, "loss": 0.0, "reward": 2.045686960220337, "reward_std": 0.08356797695159912, "rewards/accuracy_reward": 0.8581868410110474, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1572, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 471.265625, "epoch": 0.10825132475397427, "grad_norm": 0.0, "kl": 0.6953125, "learning_rate": 9.713637304939272e-07, "loss": 0.0, "reward": 2.1125001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0625, "step": 1573, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 472.796875, "epoch": 0.10832014314224761, "grad_norm": 0.0, "kl": 0.6875, "learning_rate": 9.713276613203635e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1574, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 442.96875, "epoch": 0.10838896153052095, "grad_norm": 0.0, "kl": 0.69140625, "learning_rate": 9.712915701159288e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1575, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 452.140625, "epoch": 0.10845777991879431, "grad_norm": 0.0, "kl": 0.5703125, "learning_rate": 9.712554568823103e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1576, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 455.75, "epoch": 0.10852659830706765, "grad_norm": 0.7496251056318033, "kl": 0.5703125, "learning_rate": 9.712193216211958e-07, "loss": 0.0, "reward": 2.1199545860290527, "reward_std": 0.003919167444109917, "rewards/accuracy_reward": 0.9199547171592712, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1577, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 474.515625, "epoch": 0.108595416695341, "grad_norm": 0.5583300639641365, "kl": 0.69140625, "learning_rate": 9.711831643342742e-07, "loss": -0.0, "reward": 1.9156250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1578, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 428.34375, "epoch": 0.10866423508361434, "grad_norm": 1.902557149552754, "kl": 0.609375, "learning_rate": 9.711469850232358e-07, "loss": 0.0, "reward": 2.1624999046325684, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1579, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 466.953125, "epoch": 0.10873305347188769, "grad_norm": 0.5787019647375967, "kl": 0.54296875, "learning_rate": 9.71110783689772e-07, "loss": -0.0, "reward": 2.6223559379577637, "reward_std": 0.004612878896296024, "rewards/accuracy_reward": 0.9223558306694031, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1580, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 473.984375, "epoch": 0.10880187186016103, "grad_norm": 0.6497116776271633, "kl": 0.703125, "learning_rate": 9.71074560335574e-07, "loss": -0.0, "reward": 1.7066786289215088, "reward_std": 0.08708713948726654, "rewards/accuracy_reward": 0.5285536646842957, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0625, "step": 1581, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 465.5625, "epoch": 0.10887069024843438, "grad_norm": 1.0690463642907362, "kl": 0.6875, "learning_rate": 9.71038314962336e-07, "loss": -0.0, "reward": 2.464489698410034, "reward_std": 0.08902537822723389, "rewards/accuracy_reward": 0.7957398295402527, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1582, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 455.953125, "epoch": 0.10893950863670773, "grad_norm": 1.0917000307289593, "kl": 0.57421875, "learning_rate": 9.710020475717515e-07, "loss": -0.0, "reward": 1.9937500953674316, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1583, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 438.84375, "epoch": 0.10900832702498107, "grad_norm": 0.7999377153969197, "kl": 0.56640625, "learning_rate": 9.70965758165516e-07, "loss": 0.0, "reward": 2.012500047683716, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1584, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 453.65625, "epoch": 0.10907714541325442, "grad_norm": 1.5903933614867196, "kl": 0.5546875, "learning_rate": 9.709294467453257e-07, "loss": -0.0, "reward": 2.4604415893554688, "reward_std": 0.07755933701992035, "rewards/accuracy_reward": 0.7791916728019714, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1585, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 489.4375, "epoch": 0.10914596380152777, "grad_norm": 1.091304482717516, "kl": 0.6171875, "learning_rate": 9.708931133128777e-07, "loss": 0.0, "reward": 2.2951505184173584, "reward_std": 0.0841849073767662, "rewards/accuracy_reward": 0.6357755064964294, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1586, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 467.09375, "epoch": 0.10921478218980112, "grad_norm": 2.824537477526332, "kl": 0.578125, "learning_rate": 9.708567578698705e-07, "loss": 0.0, "reward": 1.7469083070755005, "reward_std": 0.08267681300640106, "rewards/accuracy_reward": 0.6094082593917847, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1587, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 465.15625, "epoch": 0.10928360057807446, "grad_norm": 5.539398690672906, "kl": 0.5703125, "learning_rate": 9.708203804180035e-07, "loss": 0.0, "reward": 2.3968749046325684, "reward_std": 0.2361002117395401, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1588, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 464.0, "epoch": 0.10935241896634781, "grad_norm": 0.40433899698264036, "kl": 0.578125, "learning_rate": 9.70783980958977e-07, "loss": 0.0, "reward": 2.3812499046325684, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1589, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 470.15625, "epoch": 0.10942123735462116, "grad_norm": 2.318032577427807, "kl": 0.7265625, "learning_rate": 9.707475594944922e-07, "loss": -0.0, "reward": 1.90252685546875, "reward_std": 0.09663686901330948, "rewards/accuracy_reward": 0.7478393316268921, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0078125, "step": 1590, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 447.734375, "epoch": 0.1094900557428945, "grad_norm": 0.0, "kl": 0.56640625, "learning_rate": 9.707111160262518e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1591, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 435.4375, "epoch": 0.10955887413116785, "grad_norm": 1.516448074244488, "kl": 0.671875, "learning_rate": 9.70674650555959e-07, "loss": 0.0, "reward": 2.3184332847595215, "reward_std": 0.04708680883049965, "rewards/accuracy_reward": 0.668433427810669, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1592, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 518.9375, "epoch": 0.1096276925194412, "grad_norm": 0.41377483815728694, "kl": 0.625, "learning_rate": 9.706381630853184e-07, "loss": -0.0, "reward": 2.447417974472046, "reward_std": 0.06974273175001144, "rewards/accuracy_reward": 0.7911679744720459, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 1593, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 449.46875, "epoch": 0.10969651090771454, "grad_norm": 1.6694390563058081, "kl": 0.56640625, "learning_rate": 9.706016536160355e-07, "loss": 0.0, "reward": 1.8644341230392456, "reward_std": 0.07408708333969116, "rewards/accuracy_reward": 0.7081841230392456, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1594, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 473.390625, "epoch": 0.10976532929598788, "grad_norm": 0.0, "kl": 0.5703125, "learning_rate": 9.705651221498168e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1595, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 476.84375, "epoch": 0.10983414768426124, "grad_norm": 0.0, "kl": 0.56640625, "learning_rate": 9.705285686883696e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1596, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 522.71875, "epoch": 0.10990296607253458, "grad_norm": 1.7395557486775597, "kl": 0.52734375, "learning_rate": 9.704919932334032e-07, "loss": -0.0, "reward": 2.236896514892578, "reward_std": 0.0065400791354477406, "rewards/accuracy_reward": 0.5868964195251465, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1597, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 452.03125, "epoch": 0.10997178446080792, "grad_norm": 7.323091490846556, "kl": 0.55859375, "learning_rate": 9.704553957866265e-07, "loss": -0.0, "reward": 1.8364417552947998, "reward_std": 0.03430069237947464, "rewards/accuracy_reward": 0.664566695690155, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1598, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 472.8125, "epoch": 0.11004060284908128, "grad_norm": 0.8001161186373138, "kl": 0.5625, "learning_rate": 9.704187763497507e-07, "loss": -0.0, "reward": 1.9335213899612427, "reward_std": 0.07371598482131958, "rewards/accuracy_reward": 0.7647714018821716, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1599, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 497.390625, "epoch": 0.11010942123735462, "grad_norm": 1.6953346105816927, "kl": 0.55078125, "learning_rate": 9.70382134924487e-07, "loss": -0.0, "reward": 1.7347991466522217, "reward_std": 0.07807404547929764, "rewards/accuracy_reward": 0.5785491466522217, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1600, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 517.671875, "epoch": 0.11017823962562796, "grad_norm": 0.0, "kl": 0.54296875, "learning_rate": 9.703454715125483e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1601, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 542.375, "epoch": 0.11024705801390132, "grad_norm": 0.0, "kl": 0.67578125, "learning_rate": 9.703087861156483e-07, "loss": 0.0, "reward": 2.4250001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1602, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 519.46875, "epoch": 0.11031587640217466, "grad_norm": 0.5665351162998411, "kl": 0.55859375, "learning_rate": 9.702720787355016e-07, "loss": 0.0, "reward": 2.2078123092651367, "reward_std": 0.11932426691055298, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1603, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 540.265625, "epoch": 0.110384694790448, "grad_norm": 1.5940244881154437, "kl": 0.6640625, "learning_rate": 9.702353493738245e-07, "loss": 0.0, "reward": 2.2515625953674316, "reward_std": 0.2771350145339966, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.3671875, "step": 1604, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 471.5625, "epoch": 0.11045351317872136, "grad_norm": 0.7929951627612907, "kl": 0.57421875, "learning_rate": 9.701985980323332e-07, "loss": -0.0, "reward": 1.7081387042999268, "reward_std": 0.011846387758851051, "rewards/accuracy_reward": 0.5581386089324951, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1605, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 520.21875, "epoch": 0.1105223315669947, "grad_norm": 1.0662286608984632, "kl": 0.52734375, "learning_rate": 9.701618247127459e-07, "loss": 0.0, "reward": 1.90245521068573, "reward_std": 0.029691580682992935, "rewards/accuracy_reward": 0.7305801510810852, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1606, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 488.046875, "epoch": 0.11059114995526804, "grad_norm": 1.6502130922376714, "kl": 0.55859375, "learning_rate": 9.701250294167814e-07, "loss": 0.0, "reward": 2.111948013305664, "reward_std": 0.07317991554737091, "rewards/accuracy_reward": 0.9150729775428772, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1607, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 523.9375, "epoch": 0.1106599683435414, "grad_norm": 0.4835992079123175, "kl": 0.52734375, "learning_rate": 9.700882121461596e-07, "loss": -0.0, "reward": 2.125460624694824, "reward_std": 0.004502836614847183, "rewards/accuracy_reward": 0.9254605770111084, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1608, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 485.390625, "epoch": 0.11072878673181474, "grad_norm": 0.36729653077514546, "kl": 0.5625, "learning_rate": 9.70051372902601e-07, "loss": 0.0, "reward": 2.111149311065674, "reward_std": 0.0077496664598584175, "rewards/accuracy_reward": 0.9111493825912476, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1609, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 499.8125, "epoch": 0.11079760512008809, "grad_norm": 0.4485855572531582, "kl": 0.6640625, "learning_rate": 9.700145116878285e-07, "loss": -0.0, "reward": 2.065624952316284, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1610, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 504.140625, "epoch": 0.11086642350836143, "grad_norm": 0.9029168241152454, "kl": 0.55859375, "learning_rate": 9.69977628503564e-07, "loss": -0.0, "reward": 2.1290907859802246, "reward_std": 0.001455005258321762, "rewards/accuracy_reward": 0.9290909767150879, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1611, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 481.046875, "epoch": 0.11093524189663478, "grad_norm": 0.9639506265593353, "kl": 0.57421875, "learning_rate": 9.699407233515323e-07, "loss": 0.0, "reward": 1.9756982326507568, "reward_std": 0.002744432305917144, "rewards/accuracy_reward": 0.8006982803344727, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1612, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 588.828125, "epoch": 0.11100406028490813, "grad_norm": 1.1912814988754559, "kl": 0.5859375, "learning_rate": 9.69903796233458e-07, "loss": -0.0, "reward": 1.746509075164795, "reward_std": 0.01581439934670925, "rewards/accuracy_reward": 0.5715090036392212, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1613, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 507.734375, "epoch": 0.11107287867318147, "grad_norm": 0.6272572975532322, "kl": 0.55078125, "learning_rate": 9.698668471510674e-07, "loss": 0.0, "reward": 2.2312498092651367, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1614, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 497.859375, "epoch": 0.11114169706145482, "grad_norm": 1.0408847640521408, "kl": 0.56640625, "learning_rate": 9.698298761060872e-07, "loss": 0.0, "reward": 2.049999952316284, "reward_std": 0.13887301087379456, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1615, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 488.328125, "epoch": 0.11121051544972817, "grad_norm": 0.9877074727542258, "kl": 0.55078125, "learning_rate": 9.69792883100246e-07, "loss": 0.0, "reward": 1.9761226177215576, "reward_std": 0.0033387471921741962, "rewards/accuracy_reward": 0.8011225461959839, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1616, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 479.625, "epoch": 0.11127933383800151, "grad_norm": 0.7950174794457358, "kl": 0.5703125, "learning_rate": 9.697558681352724e-07, "loss": -0.0, "reward": 2.5875000953674316, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1617, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 484.4375, "epoch": 0.11134815222627487, "grad_norm": 1.1003028805210289, "kl": 0.68359375, "learning_rate": 9.697188312128973e-07, "loss": 0.0, "reward": 2.5242252349853516, "reward_std": 0.054919250309467316, "rewards/accuracy_reward": 0.8898503184318542, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1618, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 480.0625, "epoch": 0.11141697061454821, "grad_norm": 0.0, "kl": 0.6796875, "learning_rate": 9.696817723348511e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1619, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 524.375, "epoch": 0.11148578900282155, "grad_norm": 0.6014725585521153, "kl": 0.64453125, "learning_rate": 9.696446915028665e-07, "loss": -0.0, "reward": 1.830640435218811, "reward_std": 0.13746172189712524, "rewards/accuracy_reward": 0.69314044713974, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1620, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 502.875, "epoch": 0.1115546073910949, "grad_norm": 0.0, "kl": 0.66796875, "learning_rate": 9.696075887186763e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1621, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 493.78125, "epoch": 0.11162342577936825, "grad_norm": 0.7168369646379458, "kl": 0.66796875, "learning_rate": 9.695704639840152e-07, "loss": 0.0, "reward": 2.4117989540100098, "reward_std": 0.01503645908087492, "rewards/accuracy_reward": 0.7992990016937256, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1622, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 520.671875, "epoch": 0.11169224416764159, "grad_norm": 0.6615249727992637, "kl": 0.51171875, "learning_rate": 9.695333173006184e-07, "loss": -0.0, "reward": 1.4912641048431396, "reward_std": 0.072749562561512, "rewards/accuracy_reward": 0.39751407504081726, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1623, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 473.90625, "epoch": 0.11176106255591495, "grad_norm": 0.0, "kl": 0.55859375, "learning_rate": 9.694961486702222e-07, "loss": 0.0, "reward": 1.7935185432434082, "reward_std": 0.0, "rewards/accuracy_reward": 0.6435185074806213, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1624, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 475.6875, "epoch": 0.11182988094418829, "grad_norm": 1.2931143581554718, "kl": 0.5703125, "learning_rate": 9.69458958094564e-07, "loss": 0.0, "reward": 1.662520170211792, "reward_std": 0.16604267060756683, "rewards/accuracy_reward": 0.5187700986862183, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1625, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 485.78125, "epoch": 0.11189869933246163, "grad_norm": 0.0, "kl": 0.5390625, "learning_rate": 9.694217455753818e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1626, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 475.21875, "epoch": 0.11196751772073497, "grad_norm": 0.0, "kl": 0.57421875, "learning_rate": 9.693845111144153e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1627, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 509.578125, "epoch": 0.11203633610900833, "grad_norm": 0.0, "kl": 0.51953125, "learning_rate": 9.69347254713405e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1628, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 539.9375, "epoch": 0.11210515449728167, "grad_norm": 0.0, "kl": 0.52734375, "learning_rate": 9.693099763740923e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1629, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 520.390625, "epoch": 0.11217397288555501, "grad_norm": 0.376701375665696, "kl": 0.52734375, "learning_rate": 9.692726760982194e-07, "loss": -0.0, "reward": 1.9735954999923706, "reward_std": 0.002680320292711258, "rewards/accuracy_reward": 0.7985953688621521, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1630, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 480.8125, "epoch": 0.11224279127382837, "grad_norm": 0.41941314278528163, "kl": 0.5703125, "learning_rate": 9.6923535388753e-07, "loss": 0.0, "reward": 2.1401376724243164, "reward_std": 0.005955354776233435, "rewards/accuracy_reward": 0.940137505531311, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1631, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 484.203125, "epoch": 0.11231160966210171, "grad_norm": 2.2774896077251565, "kl": 0.5625, "learning_rate": 9.691980097437687e-07, "loss": 0.0, "reward": 1.7896732091903687, "reward_std": 0.01531824842095375, "rewards/accuracy_reward": 0.6146731972694397, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1632, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 536.3125, "epoch": 0.11238042805037506, "grad_norm": 3.0528985108867945, "kl": 0.5390625, "learning_rate": 9.691606436686809e-07, "loss": 0.0, "reward": 2.2821898460388184, "reward_std": 0.01735665090382099, "rewards/accuracy_reward": 0.6321898698806763, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1633, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 526.765625, "epoch": 0.11244924643864841, "grad_norm": 0.805757105929559, "kl": 0.56640625, "learning_rate": 9.69123255664013e-07, "loss": -0.0, "reward": 2.132762908935547, "reward_std": 0.0035057400818914175, "rewards/accuracy_reward": 0.9327628016471863, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1634, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 518.265625, "epoch": 0.11251806482692175, "grad_norm": 0.8450352141666582, "kl": 0.5703125, "learning_rate": 9.690858457315133e-07, "loss": -0.0, "reward": 2.5656251907348633, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1635, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 514.84375, "epoch": 0.1125868832151951, "grad_norm": 5.435865971206027, "kl": 0.5859375, "learning_rate": 9.690484138729296e-07, "loss": 0.0, "reward": 2.4890623092651367, "reward_std": 0.17235727608203888, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1636, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 495.65625, "epoch": 0.11265570160346845, "grad_norm": 0.7030720055609422, "kl": 0.5703125, "learning_rate": 9.69010960090012e-07, "loss": -0.0, "reward": 1.8239250183105469, "reward_std": 0.08470042049884796, "rewards/accuracy_reward": 0.6801749467849731, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1637, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 498.875, "epoch": 0.1127245199917418, "grad_norm": 1.1429488684113305, "kl": 0.59375, "learning_rate": 9.68973484384511e-07, "loss": -0.0, "reward": 1.9375, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1638, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 553.125, "epoch": 0.11279333838001514, "grad_norm": 0.5148113216297374, "kl": 0.5546875, "learning_rate": 9.689359867581784e-07, "loss": -0.0, "reward": 1.9506287574768066, "reward_std": 0.07100702822208405, "rewards/accuracy_reward": 0.7943786382675171, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1639, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 516.375, "epoch": 0.1128621567682885, "grad_norm": 0.0, "kl": 0.65625, "learning_rate": 9.688984672127668e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1640, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 486.59375, "epoch": 0.11293097515656184, "grad_norm": 1.3795974833106632, "kl": 0.6328125, "learning_rate": 9.6886092575003e-07, "loss": 0.0, "reward": 1.931249976158142, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1641, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 504.328125, "epoch": 0.11299979354483518, "grad_norm": 1.1183456715657039, "kl": 0.67578125, "learning_rate": 9.68823362371723e-07, "loss": 0.0, "reward": 2.4718751907348633, "reward_std": 0.0289318785071373, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.421875, "step": 1642, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 539.34375, "epoch": 0.11306861193310852, "grad_norm": 0.8909394454947919, "kl": 0.5859375, "learning_rate": 9.68785777079601e-07, "loss": 0.0, "reward": 2.1815853118896484, "reward_std": 0.16639748215675354, "rewards/accuracy_reward": 0.5878352522850037, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.5, "step": 1643, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 517.125, "epoch": 0.11313743032138188, "grad_norm": 1.026254288884293, "kl": 0.59765625, "learning_rate": 9.687481698754215e-07, "loss": -0.0, "reward": 1.975000023841858, "reward_std": 0.08017837256193161, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1644, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 490.578125, "epoch": 0.11320624870965522, "grad_norm": 1.3023371111243633, "kl": 0.6015625, "learning_rate": 9.68710540760942e-07, "loss": 0.0, "reward": 2.019524574279785, "reward_std": 0.012365912087261677, "rewards/accuracy_reward": 0.8195246458053589, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1645, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 506.890625, "epoch": 0.11327506709792856, "grad_norm": 0.0, "kl": 0.671875, "learning_rate": 9.686728897379215e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1646, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 447.890625, "epoch": 0.11334388548620192, "grad_norm": 0.0, "kl": 0.6640625, "learning_rate": 9.686352168081196e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1647, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 458.03125, "epoch": 0.11341270387447526, "grad_norm": 0.8750719027653808, "kl": 0.65234375, "learning_rate": 9.685975219732975e-07, "loss": -0.0, "reward": 1.829695463180542, "reward_std": 0.05809289589524269, "rewards/accuracy_reward": 0.6828204393386841, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1648, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 471.328125, "epoch": 0.1134815222627486, "grad_norm": 2.350960594504184, "kl": 0.60546875, "learning_rate": 9.685598052352172e-07, "loss": -0.0, "reward": 1.7390295267105103, "reward_std": 0.006794001907110214, "rewards/accuracy_reward": 0.5890294313430786, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1649, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 491.5, "epoch": 0.11355034065102196, "grad_norm": 0.744529512811468, "kl": 0.6171875, "learning_rate": 9.685220665956411e-07, "loss": 0.0, "reward": 2.399972677230835, "reward_std": 0.08326073735952377, "rewards/accuracy_reward": 0.737472653388977, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1650, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 483.859375, "epoch": 0.1136191590392953, "grad_norm": 0.6321557767301771, "kl": 0.60546875, "learning_rate": 9.68484306056334e-07, "loss": -0.0, "reward": 1.903720498085022, "reward_std": 0.0075731766410171986, "rewards/accuracy_reward": 0.728720486164093, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1651, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 453.53125, "epoch": 0.11368797742756864, "grad_norm": 0.5515169219977684, "kl": 0.625, "learning_rate": 9.684465236190602e-07, "loss": -0.0, "reward": 1.8234926462173462, "reward_std": 0.0032860380597412586, "rewards/accuracy_reward": 0.6734925508499146, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1652, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 423.015625, "epoch": 0.113756795815842, "grad_norm": 0.5337555518497895, "kl": 0.65625, "learning_rate": 9.684087192855861e-07, "loss": 0.0, "reward": 2.1241235733032227, "reward_std": 0.015899639576673508, "rewards/accuracy_reward": 0.9241236448287964, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1653, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 426.234375, "epoch": 0.11382561420411534, "grad_norm": 0.0, "kl": 0.6875, "learning_rate": 9.683708930576788e-07, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1654, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 475.890625, "epoch": 0.11389443259238868, "grad_norm": 1.9833333518463248, "kl": 0.625, "learning_rate": 9.683330449371061e-07, "loss": 0.0, "reward": 2.6260170936584473, "reward_std": 0.0037480120081454515, "rewards/accuracy_reward": 0.9260169267654419, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1655, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 433.546875, "epoch": 0.11396325098066204, "grad_norm": 0.0, "kl": 0.625, "learning_rate": 9.682951749256376e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1656, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 482.0, "epoch": 0.11403206936893538, "grad_norm": 0.0, "kl": 0.66796875, "learning_rate": 9.682572830250428e-07, "loss": 0.0, "reward": 2.3375000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1657, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 472.59375, "epoch": 0.11410088775720872, "grad_norm": 0.6815220033015713, "kl": 0.63671875, "learning_rate": 9.682193692370932e-07, "loss": 0.0, "reward": 2.4484376907348633, "reward_std": 0.03234682232141495, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3984375, "step": 1658, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 469.3125, "epoch": 0.11416970614548207, "grad_norm": 1.361126132409192, "kl": 0.66796875, "learning_rate": 9.681814335635609e-07, "loss": -0.0, "reward": 1.803125023841858, "reward_std": 0.11363068222999573, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1659, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 462.0, "epoch": 0.11423852453375542, "grad_norm": 0.9595452561962255, "kl": 0.73046875, "learning_rate": 9.681434760062192e-07, "loss": -0.0, "reward": 2.5155861377716064, "reward_std": 0.024152535945177078, "rewards/accuracy_reward": 0.8155860900878906, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1660, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 438.1875, "epoch": 0.11430734292202877, "grad_norm": 0.6321448770721623, "kl": 0.61328125, "learning_rate": 9.681054965668422e-07, "loss": 0.0, "reward": 2.2078123092651367, "reward_std": 0.11932426691055298, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1661, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 431.671875, "epoch": 0.11437616131030211, "grad_norm": 24.07280298813696, "kl": 0.66015625, "learning_rate": 9.680674952472053e-07, "loss": 0.0, "reward": 2.046875, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1662, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 445.34375, "epoch": 0.11444497969857546, "grad_norm": 0.5370178491546539, "kl": 0.6171875, "learning_rate": 9.680294720490846e-07, "loss": 0.0, "reward": 1.982120156288147, "reward_std": 0.002109918277710676, "rewards/accuracy_reward": 0.8071200847625732, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1663, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 392.609375, "epoch": 0.1145137980868488, "grad_norm": 1.2312562107583997, "kl": 0.703125, "learning_rate": 9.679914269742572e-07, "loss": 0.0, "reward": 2.453125, "reward_std": 0.18646937608718872, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1664, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 458.703125, "epoch": 0.11458261647512215, "grad_norm": 0.7028739915916197, "kl": 0.75390625, "learning_rate": 9.679533600245018e-07, "loss": 0.0, "reward": 2.034374952316284, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1665, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 437.921875, "epoch": 0.1146514348633955, "grad_norm": 1.6658373316343047, "kl": 0.65625, "learning_rate": 9.679152712015978e-07, "loss": 0.0, "reward": 2.1624999046325684, "reward_std": 0.1060660183429718, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1666, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 445.984375, "epoch": 0.11472025325166885, "grad_norm": 0.0, "kl": 0.7890625, "learning_rate": 9.67877160507325e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1667, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 427.921875, "epoch": 0.11478907163994219, "grad_norm": 0.6571220878993329, "kl": 0.65234375, "learning_rate": 9.67839027943465e-07, "loss": -0.0, "reward": 1.9376097917556763, "reward_std": 0.010450000874698162, "rewards/accuracy_reward": 0.7657347321510315, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1668, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 442.40625, "epoch": 0.11485789002821555, "grad_norm": 0.9410505076689796, "kl": 0.78515625, "learning_rate": 9.678008735118003e-07, "loss": 0.0, "reward": 2.371875047683716, "reward_std": 0.07827534526586533, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1669, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 457.046875, "epoch": 0.11492670841648889, "grad_norm": 0.9567464573507299, "kl": 0.625, "learning_rate": 9.677626972141147e-07, "loss": 0.0, "reward": 1.6659011840820312, "reward_std": 0.08747697621583939, "rewards/accuracy_reward": 0.5284010767936707, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1670, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 449.5, "epoch": 0.11499552680476223, "grad_norm": 0.0, "kl": 0.8125, "learning_rate": 9.67724499052192e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1671, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 460.421875, "epoch": 0.11506434519303559, "grad_norm": 0.0, "kl": 0.7890625, "learning_rate": 9.67686279027818e-07, "loss": 0.0, "reward": 2.3000001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.25, "step": 1672, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 453.265625, "epoch": 0.11513316358130893, "grad_norm": 0.5308544223958409, "kl": 0.6171875, "learning_rate": 9.676480371427788e-07, "loss": -0.0, "reward": 1.9230988025665283, "reward_std": 0.09143630415201187, "rewards/accuracy_reward": 0.7855987548828125, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 1673, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 417.46875, "epoch": 0.11520198196958227, "grad_norm": 1.323712503465163, "kl": 0.640625, "learning_rate": 9.676097733988626e-07, "loss": 0.0, "reward": 1.9493799209594727, "reward_std": 0.07410258054733276, "rewards/accuracy_reward": 0.7806298732757568, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1674, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 439.046875, "epoch": 0.11527080035785561, "grad_norm": 0.6914893198890656, "kl": 0.8125, "learning_rate": 9.675714877978573e-07, "loss": -0.0, "reward": 2.2093749046325684, "reward_std": 0.12182655930519104, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1675, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 432.484375, "epoch": 0.11533961874612897, "grad_norm": 0.0, "kl": 0.6171875, "learning_rate": 9.67533180341553e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1676, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 416.140625, "epoch": 0.11540843713440231, "grad_norm": 1.3577668359065302, "kl": 0.734375, "learning_rate": 9.674948510317397e-07, "loss": 0.0, "reward": 2.528162956237793, "reward_std": 0.08092956244945526, "rewards/accuracy_reward": 0.8437881469726562, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1677, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 439.234375, "epoch": 0.11547725552267565, "grad_norm": 0.8629426687127404, "kl": 0.62890625, "learning_rate": 9.674564998702094e-07, "loss": -0.0, "reward": 1.9156250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1678, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 436.640625, "epoch": 0.11554607391094901, "grad_norm": 0.0, "kl": 0.61328125, "learning_rate": 9.674181268587546e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1679, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 439.296875, "epoch": 0.11561489229922235, "grad_norm": 2.8802499440131974, "kl": 0.78515625, "learning_rate": 9.673797319991686e-07, "loss": -0.0, "reward": 2.2203125953674316, "reward_std": 0.12073516845703125, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.2890625, "step": 1680, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 405.0, "epoch": 0.1156837106874957, "grad_norm": 2.1536595088299757, "kl": 0.63671875, "learning_rate": 9.673413152932465e-07, "loss": -0.0, "reward": 2.164097785949707, "reward_std": 0.017864786088466644, "rewards/accuracy_reward": 0.9672226905822754, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1681, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 427.28125, "epoch": 0.11575252907576905, "grad_norm": 0.7236800984292008, "kl": 0.63671875, "learning_rate": 9.67302876742784e-07, "loss": -0.0, "reward": 2.1437501907348633, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1682, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 437.296875, "epoch": 0.11582134746404239, "grad_norm": 0.45397597332554834, "kl": 0.57421875, "learning_rate": 9.672644163495776e-07, "loss": 0.0, "reward": 1.6750000715255737, "reward_std": 0.08017838001251221, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1683, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 443.734375, "epoch": 0.11589016585231574, "grad_norm": 1.4618180214287642, "kl": 0.58984375, "learning_rate": 9.672259341154253e-07, "loss": 0.0, "reward": 2.1968750953674316, "reward_std": 0.008838837966322899, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1684, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 464.59375, "epoch": 0.11595898424058909, "grad_norm": 1.0372615325027776, "kl": 0.5703125, "learning_rate": 9.671874300421252e-07, "loss": 0.0, "reward": 2.1797609329223633, "reward_std": 0.019058892503380775, "rewards/accuracy_reward": 0.5360110998153687, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1685, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 448.59375, "epoch": 0.11602780262886243, "grad_norm": 0.5000909740045515, "kl": 0.58984375, "learning_rate": 9.671489041314777e-07, "loss": 0.0, "reward": 2.512500047683716, "reward_std": 0.06943651288747787, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1686, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 469.84375, "epoch": 0.11609662101713578, "grad_norm": 0.0, "kl": 0.5625, "learning_rate": 9.671103563852834e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1687, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 455.3125, "epoch": 0.11616543940540913, "grad_norm": 2.5004015164998954, "kl": 0.6953125, "learning_rate": 9.67071786805344e-07, "loss": 0.0, "reward": 2.606250286102295, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1688, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 467.28125, "epoch": 0.11623425779368247, "grad_norm": 1.8343095308909423, "kl": 0.5546875, "learning_rate": 9.670331953934625e-07, "loss": 0.0, "reward": 2.040196418762207, "reward_std": 0.11917901039123535, "rewards/accuracy_reward": 0.8589465618133545, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1689, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 482.03125, "epoch": 0.11630307618195582, "grad_norm": 0.6865715965079643, "kl": 0.5703125, "learning_rate": 9.669945821514426e-07, "loss": 0.0, "reward": 2.03125, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1690, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 536.0, "epoch": 0.11637189457022916, "grad_norm": 1.078503936368403, "kl": 0.5078125, "learning_rate": 9.66955947081089e-07, "loss": 0.0, "reward": 1.9721002578735352, "reward_std": 0.011187081225216389, "rewards/accuracy_reward": 0.7721002697944641, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1691, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 544.265625, "epoch": 0.11644071295850252, "grad_norm": 1.6918049080339859, "kl": 0.67578125, "learning_rate": 9.66917290184208e-07, "loss": 0.0, "reward": 2.1128005981445312, "reward_std": 0.14025643467903137, "rewards/accuracy_reward": 0.6081129312515259, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.3671875, "step": 1692, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 510.640625, "epoch": 0.11650953134677586, "grad_norm": 1.2928199239294735, "kl": 0.62109375, "learning_rate": 9.668786114626062e-07, "loss": 0.0, "reward": 1.8953876495361328, "reward_std": 0.09475716948509216, "rewards/accuracy_reward": 0.7360126376152039, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1693, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 539.640625, "epoch": 0.1165783497350492, "grad_norm": 1.2999262080467093, "kl": 0.58984375, "learning_rate": 9.668399109180917e-07, "loss": 0.0, "reward": 2.473437547683716, "reward_std": 0.21655145287513733, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.4921875, "step": 1694, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 521.25, "epoch": 0.11664716812332256, "grad_norm": 0.6253487602224549, "kl": 0.546875, "learning_rate": 9.668011885524735e-07, "loss": 0.0, "reward": 1.8732173442840576, "reward_std": 0.07215812802314758, "rewards/accuracy_reward": 0.7169672846794128, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1695, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 500.328125, "epoch": 0.1167159865115959, "grad_norm": 0.8684624904180943, "kl": 0.7734375, "learning_rate": 9.667624443675612e-07, "loss": 0.0, "reward": 2.46875, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1696, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 504.421875, "epoch": 0.11678480489986924, "grad_norm": 0.0, "kl": 0.78125, "learning_rate": 9.667236783651662e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1697, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 484.984375, "epoch": 0.1168536232881426, "grad_norm": 0.0, "kl": 0.625, "learning_rate": 9.666848905471003e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1698, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 440.03125, "epoch": 0.11692244167641594, "grad_norm": 1.4719350387768482, "kl": 0.703125, "learning_rate": 9.666460809151766e-07, "loss": -0.0, "reward": 2.1814403533935547, "reward_std": 0.011587508022785187, "rewards/accuracy_reward": 0.5314404964447021, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1699, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 473.203125, "epoch": 0.11699126006468928, "grad_norm": 1.3881531389578516, "kl": 0.6484375, "learning_rate": 9.66607249471209e-07, "loss": -0.0, "reward": 2.4351630210876465, "reward_std": 0.09217125922441483, "rewards/accuracy_reward": 0.7757877707481384, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1700, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 516.890625, "epoch": 0.11706007845296264, "grad_norm": 0.9353750921630922, "kl": 0.625, "learning_rate": 9.665683962170127e-07, "loss": -0.0, "reward": 1.9668530225753784, "reward_std": 0.008375044912099838, "rewards/accuracy_reward": 0.7918530702590942, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1701, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 535.375, "epoch": 0.11712889684123598, "grad_norm": 8.993577441928744, "kl": 0.6015625, "learning_rate": 9.665295211544038e-07, "loss": -0.0, "reward": 2.237626552581787, "reward_std": 0.08103957027196884, "rewards/accuracy_reward": 0.603251576423645, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1702, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 490.375, "epoch": 0.11719771522950932, "grad_norm": 0.0, "kl": 0.796875, "learning_rate": 9.664906242851994e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1703, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 521.890625, "epoch": 0.11726653361778268, "grad_norm": 2.115145358358944, "kl": 0.640625, "learning_rate": 9.664517056112175e-07, "loss": 0.0, "reward": 2.074744939804077, "reward_std": 0.08587007969617844, "rewards/accuracy_reward": 0.8872449994087219, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1704, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 516.828125, "epoch": 0.11733535200605602, "grad_norm": 0.7064932167732313, "kl": 0.73046875, "learning_rate": 9.664127651342774e-07, "loss": 0.0, "reward": 1.755047082901001, "reward_std": 0.17949557304382324, "rewards/accuracy_reward": 0.6894220113754272, "rewards/format_reward": 0.921875, "rewards/transform_reward": 0.0, "step": 1705, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 475.703125, "epoch": 0.11740417039432936, "grad_norm": 0.0, "kl": 0.64453125, "learning_rate": 9.663738028561991e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1706, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 491.3125, "epoch": 0.1174729887826027, "grad_norm": 1.451465851747237, "kl": 0.62890625, "learning_rate": 9.66334818778804e-07, "loss": 0.0, "reward": 2.0249106884002686, "reward_std": 0.08052445203065872, "rewards/accuracy_reward": 0.8280356526374817, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1707, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 474.53125, "epoch": 0.11754180717087606, "grad_norm": 0.4750464913383506, "kl": 0.6328125, "learning_rate": 9.66295812903914e-07, "loss": 0.0, "reward": 2.198007583618164, "reward_std": 0.009207518771290779, "rewards/accuracy_reward": 0.5730074644088745, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1708, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 452.21875, "epoch": 0.1176106255591494, "grad_norm": 0.0, "kl": 0.7734375, "learning_rate": 9.662567852333527e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1709, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 451.84375, "epoch": 0.11767944394742275, "grad_norm": 1.4675782707184442, "kl": 0.765625, "learning_rate": 9.662177357689442e-07, "loss": -0.0, "reward": 2.2739038467407227, "reward_std": 0.07040620595216751, "rewards/accuracy_reward": 0.8270288109779358, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.25, "step": 1710, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 444.171875, "epoch": 0.1177482623356961, "grad_norm": 0.9646369240009964, "kl": 0.66015625, "learning_rate": 9.661786645125132e-07, "loss": 0.0, "reward": 2.1070973873138428, "reward_std": 0.009130260907113552, "rewards/accuracy_reward": 0.9070972800254822, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1711, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 474.734375, "epoch": 0.11781708072396944, "grad_norm": 0.0, "kl": 0.70703125, "learning_rate": 9.661395714658868e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1712, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 465.84375, "epoch": 0.11788589911224279, "grad_norm": 1.453359988387367, "kl": 0.76171875, "learning_rate": 9.661004566308919e-07, "loss": 0.0, "reward": 2.334315776824951, "reward_std": 0.024759775027632713, "rewards/accuracy_reward": 0.79369056224823, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1713, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 446.53125, "epoch": 0.11795471750051614, "grad_norm": 0.49951942713158837, "kl": 0.7890625, "learning_rate": 9.66061320009357e-07, "loss": 0.0, "reward": 2.5001158714294434, "reward_std": 0.003445214591920376, "rewards/accuracy_reward": 0.9251156449317932, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1714, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 487.921875, "epoch": 0.11802353588878949, "grad_norm": 0.8809782722048433, "kl": 0.66015625, "learning_rate": 9.66022161603111e-07, "loss": 0.0, "reward": 2.12800931930542, "reward_std": 0.0036952956579625607, "rewards/accuracy_reward": 0.9280092716217041, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1715, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 467.90625, "epoch": 0.11809235427706283, "grad_norm": 0.0, "kl": 0.70703125, "learning_rate": 9.659829814139847e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1716, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 451.53125, "epoch": 0.11816117266533618, "grad_norm": 1.021681872731175, "kl": 0.63671875, "learning_rate": 9.659437794438095e-07, "loss": 0.0, "reward": 2.065624952316284, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1717, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 489.234375, "epoch": 0.11822999105360953, "grad_norm": 0.9303193664442564, "kl": 0.765625, "learning_rate": 9.659045556944173e-07, "loss": 0.0, "reward": 1.8969343900680542, "reward_std": 0.006353167816996574, "rewards/accuracy_reward": 0.7219343781471252, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1718, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 414.828125, "epoch": 0.11829880944188287, "grad_norm": 0.0, "kl": 0.8203125, "learning_rate": 9.658653101676419e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1719, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 451.171875, "epoch": 0.11836762783015622, "grad_norm": 4.428247913869887, "kl": 0.72265625, "learning_rate": 9.658260428653176e-07, "loss": -0.0, "reward": 1.9726476669311523, "reward_std": 0.054547738283872604, "rewards/accuracy_reward": 0.8132726550102234, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1720, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 439.828125, "epoch": 0.11843644621842957, "grad_norm": 1.01346994383823, "kl": 0.59765625, "learning_rate": 9.6578675378928e-07, "loss": -0.0, "reward": 1.693750023841858, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1721, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 509.78125, "epoch": 0.11850526460670291, "grad_norm": 0.0, "kl": 0.64453125, "learning_rate": 9.657474429413652e-07, "loss": 0.0, "reward": 2.2750000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.875, "rewards/transform_reward": 0.5, "step": 1722, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 477.046875, "epoch": 0.11857408299497625, "grad_norm": 0.0, "kl": 0.66796875, "learning_rate": 9.657081103234108e-07, "loss": 0.0, "reward": 2.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1723, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 447.09375, "epoch": 0.11864290138324961, "grad_norm": 2.2152039306419566, "kl": 0.85546875, "learning_rate": 9.656687559372557e-07, "loss": 0.0, "reward": 2.4312500953674316, "reward_std": 0.08256310969591141, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4375, "step": 1724, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 452.6875, "epoch": 0.11871171977152295, "grad_norm": 0.0, "kl": 0.8203125, "learning_rate": 9.656293797847388e-07, "loss": 0.0, "reward": 2.3625001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3125, "step": 1725, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 458.796875, "epoch": 0.11878053815979629, "grad_norm": 0.6957975945821434, "kl": 0.6953125, "learning_rate": 9.65589981867701e-07, "loss": 0.0, "reward": 2.106250047683716, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1726, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 485.234375, "epoch": 0.11884935654806965, "grad_norm": 2.4968820283028887, "kl": 0.6171875, "learning_rate": 9.655505621879839e-07, "loss": -0.0, "reward": 2.606250047683716, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1727, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 453.203125, "epoch": 0.11891817493634299, "grad_norm": 0.9895475765539197, "kl": 0.6171875, "learning_rate": 9.655111207474297e-07, "loss": 0.0, "reward": 1.8912907838821411, "reward_std": 0.00900282058864832, "rewards/accuracy_reward": 0.7162907123565674, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1728, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 460.28125, "epoch": 0.11898699332461633, "grad_norm": 0.43474523673978105, "kl": 0.6796875, "learning_rate": 9.654716575478824e-07, "loss": -0.0, "reward": 2.6121344566345215, "reward_std": 0.005408822558820248, "rewards/accuracy_reward": 0.9121342897415161, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1729, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 445.109375, "epoch": 0.11905581171288969, "grad_norm": 0.0, "kl": 0.80078125, "learning_rate": 9.654321725911864e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1730, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 434.453125, "epoch": 0.11912463010116303, "grad_norm": 0.0, "kl": 0.625, "learning_rate": 9.65392665879187e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1731, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 464.515625, "epoch": 0.11919344848943637, "grad_norm": 4.679887223698961, "kl": 0.671875, "learning_rate": 9.653531374137314e-07, "loss": 0.0, "reward": 2.5833377838134766, "reward_std": 0.07615824043750763, "rewards/accuracy_reward": 0.889587938785553, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1732, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 443.75, "epoch": 0.11926226687770973, "grad_norm": 5.501390269121964, "kl": 0.8203125, "learning_rate": 9.653135871966668e-07, "loss": 0.0, "reward": 2.328125, "reward_std": 0.042246490716934204, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.28125, "step": 1733, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 468.8125, "epoch": 0.11933108526598307, "grad_norm": 9.761841194501322, "kl": 0.94140625, "learning_rate": 9.652740152298421e-07, "loss": -0.0, "reward": 2.057556629180908, "reward_std": 0.0956449881196022, "rewards/accuracy_reward": 0.8731815814971924, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1734, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 460.953125, "epoch": 0.11939990365425641, "grad_norm": 7.217559268571848, "kl": 156.0, "learning_rate": 9.652344215151071e-07, "loss": -0.0, "reward": 2.2874999046325684, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1735, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 445.4375, "epoch": 0.11946872204252977, "grad_norm": 0.5843935006848547, "kl": 19.125, "learning_rate": 9.65194806054312e-07, "loss": -0.0, "reward": 2.1280336380004883, "reward_std": 0.01733306422829628, "rewards/accuracy_reward": 0.9280335307121277, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1736, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 484.671875, "epoch": 0.11953754043080311, "grad_norm": 0.8714636186989578, "kl": 39.0, "learning_rate": 9.651551688493092e-07, "loss": 0.0, "reward": 2.3533239364624023, "reward_std": 0.04652636870741844, "rewards/accuracy_reward": 0.703324019908905, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1737, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 504.546875, "epoch": 0.11960635881907646, "grad_norm": 1.2692503561714852, "kl": 8.0, "learning_rate": 9.651155099019509e-07, "loss": -0.0, "reward": 2.4562501907348633, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1738, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 517.109375, "epoch": 0.1196751772073498, "grad_norm": 0.6802620804039577, "kl": 27.5, "learning_rate": 9.650758292140907e-07, "loss": -0.0, "reward": 2.5677523612976074, "reward_std": 0.0014357110485434532, "rewards/accuracy_reward": 0.9302520751953125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1739, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 499.921875, "epoch": 0.11974399559562315, "grad_norm": 0.36882873873428235, "kl": 2.5, "learning_rate": 9.65036126787584e-07, "loss": 0.0, "reward": 2.1145215034484863, "reward_std": 0.002555331913754344, "rewards/accuracy_reward": 0.914521336555481, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1740, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 540.03125, "epoch": 0.1198128139838965, "grad_norm": 0.46482775814345617, "kl": 2.84375, "learning_rate": 9.64996402624286e-07, "loss": -0.0, "reward": 1.9689667224884033, "reward_std": 0.0029933110345155, "rewards/accuracy_reward": 0.7939667701721191, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1741, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 499.84375, "epoch": 0.11988163237216984, "grad_norm": 6.076056410978984, "kl": 1.265625, "learning_rate": 9.64956656726054e-07, "loss": 0.0, "reward": 1.9655985832214355, "reward_std": 0.003433657344430685, "rewards/accuracy_reward": 0.7905985713005066, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1742, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 515.71875, "epoch": 0.1199504507604432, "grad_norm": 0.0, "kl": 1232.0, "learning_rate": 9.649168890947453e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1743, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 493.109375, "epoch": 0.12001926914871654, "grad_norm": 0.0, "kl": 334.0, "learning_rate": 9.64877099732219e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1744, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 522.578125, "epoch": 0.12008808753698988, "grad_norm": 8.385923461808616, "kl": 356.0, "learning_rate": 9.64837288640335e-07, "loss": -0.0, "reward": 1.8418492078781128, "reward_std": 0.07194210588932037, "rewards/accuracy_reward": 0.6980991959571838, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1745, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 501.90625, "epoch": 0.12015690592526324, "grad_norm": 1.4153662590568477, "kl": 840.0, "learning_rate": 9.64797455820954e-07, "loss": -0.0, "reward": 2.45276141166687, "reward_std": 0.030522987246513367, "rewards/accuracy_reward": 0.8855737447738647, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3671875, "step": 1746, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 526.703125, "epoch": 0.12022572431353658, "grad_norm": 3.698243237411774, "kl": 988.0, "learning_rate": 9.64757601275938e-07, "loss": -0.0, "reward": 2.3843750953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.5, "step": 1747, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 510.78125, "epoch": 0.12029454270180992, "grad_norm": 4.656922091452533, "kl": 110.5, "learning_rate": 9.647177250071497e-07, "loss": 0.0, "reward": 2.4321768283843994, "reward_std": 0.14656616747379303, "rewards/accuracy_reward": 0.7978017330169678, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.5, "step": 1748, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 456.421875, "epoch": 0.12036336109008328, "grad_norm": 0.0, "kl": 11.25, "learning_rate": 9.64677827016453e-07, "loss": 0.0, "reward": 2.4875001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1749, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 470.546875, "epoch": 0.12043217947835662, "grad_norm": 0.94994259622001, "kl": 0.66015625, "learning_rate": 9.646379073057133e-07, "loss": 0.0, "reward": 2.5656251907348633, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1750, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 434.90625, "epoch": 0.12050099786662996, "grad_norm": 0.0, "kl": 0.62890625, "learning_rate": 9.64597965876796e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1751, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 433.140625, "epoch": 0.12056981625490332, "grad_norm": 0.0, "kl": 0.6328125, "learning_rate": 9.645580027315681e-07, "loss": 0.0, "reward": 2.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1752, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 486.390625, "epoch": 0.12063863464317666, "grad_norm": 0.0, "kl": 0.6328125, "learning_rate": 9.645180178718979e-07, "loss": 0.0, "reward": 2.4388890266418457, "reward_std": 0.0, "rewards/accuracy_reward": 0.7638888359069824, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1753, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 459.828125, "epoch": 0.12070745303145, "grad_norm": 0.5109433308352026, "kl": 0.8203125, "learning_rate": 9.64478011299654e-07, "loss": 0.0, "reward": 2.406475305557251, "reward_std": 0.0027578091248869896, "rewards/accuracy_reward": 0.7939753532409668, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1754, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 469.859375, "epoch": 0.12077627141972334, "grad_norm": 0.5225623014551907, "kl": 0.65625, "learning_rate": 9.644379830167067e-07, "loss": -0.0, "reward": 2.3186521530151367, "reward_std": 0.0025758969131857157, "rewards/accuracy_reward": 0.6686519980430603, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1755, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 450.109375, "epoch": 0.1208450898079967, "grad_norm": 1.7464657033379027, "kl": 0.83203125, "learning_rate": 9.643979330249265e-07, "loss": 0.0, "reward": 2.6812500953674316, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1756, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 424.265625, "epoch": 0.12091390819627004, "grad_norm": 0.41279625499955513, "kl": 0.8125, "learning_rate": 9.643578613261862e-07, "loss": 0.0, "reward": 2.3261075019836426, "reward_std": 0.013031440787017345, "rewards/accuracy_reward": 0.6761074066162109, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1757, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 432.15625, "epoch": 0.12098272658454338, "grad_norm": 0.0, "kl": 0.6015625, "learning_rate": 9.64317767922358e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1758, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 441.328125, "epoch": 0.12105154497281674, "grad_norm": 0.0, "kl": 0.69140625, "learning_rate": 9.64277652815317e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1759, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 435.59375, "epoch": 0.12112036336109008, "grad_norm": 1.4955756770719981, "kl": 0.8359375, "learning_rate": 9.642375160069371e-07, "loss": 0.0, "reward": 1.9495067596435547, "reward_std": 0.028363194316625595, "rewards/accuracy_reward": 0.7745068073272705, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1760, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 431.515625, "epoch": 0.12118918174936343, "grad_norm": 0.5116329996522785, "kl": 0.625, "learning_rate": 9.641973574990952e-07, "loss": 0.0, "reward": 2.125, "reward_std": 0.08017837256193161, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1761, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 442.546875, "epoch": 0.12125800013763678, "grad_norm": 1.4696573980602077, "kl": 0.6015625, "learning_rate": 9.64157177293668e-07, "loss": -0.0, "reward": 1.9159142971038818, "reward_std": 0.08140186965465546, "rewards/accuracy_reward": 0.7502893209457397, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1762, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 428.828125, "epoch": 0.12132681852591012, "grad_norm": 1.504656998032711, "kl": 0.77734375, "learning_rate": 9.641169753925338e-07, "loss": -0.0, "reward": 2.6217803955078125, "reward_std": 0.07664769142866135, "rewards/accuracy_reward": 0.928030252456665, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1763, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 433.53125, "epoch": 0.12139563691418347, "grad_norm": 0.7790179437416906, "kl": 0.625, "learning_rate": 9.640767517975716e-07, "loss": -0.0, "reward": 2.0237679481506348, "reward_std": 0.007454583887010813, "rewards/accuracy_reward": 0.8237678408622742, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1764, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 436.3125, "epoch": 0.12146445530245682, "grad_norm": 1.1344828619344385, "kl": 0.609375, "learning_rate": 9.640365065106616e-07, "loss": 0.0, "reward": 2.106250047683716, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1765, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 441.015625, "epoch": 0.12153327369073016, "grad_norm": 0.370289779521298, "kl": 0.62890625, "learning_rate": 9.63996239533685e-07, "loss": 0.0, "reward": 2.4156250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1766, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 445.453125, "epoch": 0.12160209207900351, "grad_norm": 0.49642517693479626, "kl": 0.6171875, "learning_rate": 9.63955950868524e-07, "loss": 0.0, "reward": 1.8421003818511963, "reward_std": 0.0076916939578950405, "rewards/accuracy_reward": 0.6921004056930542, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1767, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 439.296875, "epoch": 0.12167091046727686, "grad_norm": 0.6772659766927488, "kl": 0.6171875, "learning_rate": 9.639156405170615e-07, "loss": -0.0, "reward": 2.087125062942505, "reward_std": 0.005453835241496563, "rewards/accuracy_reward": 0.8871249556541443, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1768, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 429.046875, "epoch": 0.1217397288555502, "grad_norm": 0.0, "kl": 0.63671875, "learning_rate": 9.638753084811818e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1769, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 471.5, "epoch": 0.12180854724382355, "grad_norm": 1.5908919202509642, "kl": 0.6171875, "learning_rate": 9.638349547627705e-07, "loss": -0.0, "reward": 2.065624952316284, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1770, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 481.75, "epoch": 0.12187736563209689, "grad_norm": 0.7048252000694546, "kl": 0.59375, "learning_rate": 9.637945793637133e-07, "loss": -0.0, "reward": 2.047292470932007, "reward_std": 0.005409743636846542, "rewards/accuracy_reward": 0.847292423248291, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1771, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 477.484375, "epoch": 0.12194618402037025, "grad_norm": 0.8323788104970508, "kl": 0.5859375, "learning_rate": 9.637541822858976e-07, "loss": -0.0, "reward": 2.406151294708252, "reward_std": 0.011317425407469273, "rewards/accuracy_reward": 0.7311511635780334, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1772, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 502.703125, "epoch": 0.12201500240864359, "grad_norm": 1.451630865240855, "kl": 0.7109375, "learning_rate": 9.637137635312117e-07, "loss": 0.0, "reward": 2.25056791305542, "reward_std": 0.006531103048473597, "rewards/accuracy_reward": 0.6005679965019226, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1773, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 472.84375, "epoch": 0.12208382079691693, "grad_norm": 1.066301202625983, "kl": 0.6171875, "learning_rate": 9.636733231015447e-07, "loss": -0.0, "reward": 2.4562501907348633, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1774, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.375, "completion_length": 504.578125, "epoch": 0.12215263918519029, "grad_norm": 1.016019762683987, "kl": 0.8046875, "learning_rate": 9.636328609987873e-07, "loss": 0.0, "reward": 1.5542352199554443, "reward_std": 0.07119593024253845, "rewards/accuracy_reward": 0.4479852318763733, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1775, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 484.34375, "epoch": 0.12222145757346363, "grad_norm": 0.0, "kl": 0.77734375, "learning_rate": 9.635923772248305e-07, "loss": 0.0, "reward": 2.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1776, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 477.625, "epoch": 0.12229027596173697, "grad_norm": 0.8728482907501525, "kl": 0.60546875, "learning_rate": 9.635518717815666e-07, "loss": 0.0, "reward": 2.1812500953674316, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1777, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 476.09375, "epoch": 0.12235909435001033, "grad_norm": 0.0, "kl": 0.76171875, "learning_rate": 9.635113446708887e-07, "loss": 0.0, "reward": 2.450000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.25, "step": 1778, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 466.703125, "epoch": 0.12242791273828367, "grad_norm": 1.357342703130483, "kl": 0.58984375, "learning_rate": 9.634707958946915e-07, "loss": 0.0, "reward": 2.512500047683716, "reward_std": 0.06943651288747787, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1779, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 483.203125, "epoch": 0.12249673112655701, "grad_norm": 0.0, "kl": 0.7578125, "learning_rate": 9.6343022545487e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1780, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 460.359375, "epoch": 0.12256554951483037, "grad_norm": 0.6903537788605651, "kl": 0.6015625, "learning_rate": 9.633896333533209e-07, "loss": 0.0, "reward": 1.915624976158142, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1781, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 480.65625, "epoch": 0.12263436790310371, "grad_norm": 0.0, "kl": 0.60546875, "learning_rate": 9.633490195919413e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1782, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 494.125, "epoch": 0.12270318629137705, "grad_norm": 0.5484538378183336, "kl": 0.71484375, "learning_rate": 9.633083841726296e-07, "loss": 0.0, "reward": 2.3517332077026367, "reward_std": 0.00393828796222806, "rewards/accuracy_reward": 0.8017330765724182, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1783, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 496.90625, "epoch": 0.12277200467965041, "grad_norm": 0.6465097686361182, "kl": 0.7265625, "learning_rate": 9.632677270972856e-07, "loss": -0.0, "reward": 2.497340679168701, "reward_std": 0.0023517701774835587, "rewards/accuracy_reward": 0.9223406314849854, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1784, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 499.515625, "epoch": 0.12284082306792375, "grad_norm": 0.0, "kl": 0.79296875, "learning_rate": 9.632270483678088e-07, "loss": 0.0, "reward": 2.09015154838562, "reward_std": 0.0, "rewards/accuracy_reward": 0.8901515007019043, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1785, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 503.28125, "epoch": 0.1229096414561971, "grad_norm": 0.7674476952640061, "kl": 0.72265625, "learning_rate": 9.631863479861016e-07, "loss": 0.0, "reward": 2.151338577270508, "reward_std": 0.09815796464681625, "rewards/accuracy_reward": 0.6013385653495789, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.4375, "step": 1786, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 502.75, "epoch": 0.12297845984447044, "grad_norm": 0.36040488903129203, "kl": 0.5546875, "learning_rate": 9.631456259540659e-07, "loss": -0.0, "reward": 1.8245000839233398, "reward_std": 0.002672099508345127, "rewards/accuracy_reward": 0.6744999885559082, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1787, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 482.203125, "epoch": 0.12304727823274379, "grad_norm": 0.42056487985352325, "kl": 0.58203125, "learning_rate": 9.63104882273605e-07, "loss": -0.0, "reward": 1.9156250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1788, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 490.34375, "epoch": 0.12311609662101713, "grad_norm": 0.0, "kl": 0.5859375, "learning_rate": 9.63064116946624e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1789, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 523.296875, "epoch": 0.12318491500929048, "grad_norm": 2.117559460159666, "kl": 0.546875, "learning_rate": 9.630233299750274e-07, "loss": 0.0, "reward": 2.2926228046417236, "reward_std": 0.01612704247236252, "rewards/accuracy_reward": 0.642622709274292, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1790, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 532.859375, "epoch": 0.12325373339756383, "grad_norm": 1.2876567049974534, "kl": 0.66796875, "learning_rate": 9.629825213607225e-07, "loss": -0.0, "reward": 2.3789682388305664, "reward_std": 0.0831354558467865, "rewards/accuracy_reward": 0.7195930480957031, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1791, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 539.84375, "epoch": 0.12332255178583718, "grad_norm": 8.921193358040215, "kl": 0.640625, "learning_rate": 9.629416911056166e-07, "loss": 0.0, "reward": 2.512500047683716, "reward_std": 0.06943651288747787, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1792, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 559.859375, "epoch": 0.12339137017411052, "grad_norm": 0.42588583282489845, "kl": 0.53515625, "learning_rate": 9.629008392116177e-07, "loss": -0.0, "reward": 2.4751031398773193, "reward_std": 0.007806976791471243, "rewards/accuracy_reward": 0.8001030683517456, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1793, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 532.546875, "epoch": 0.12346018856238387, "grad_norm": 0.9803427446759, "kl": 0.515625, "learning_rate": 9.62859965680636e-07, "loss": -0.0, "reward": 1.6534771919250488, "reward_std": 0.09019255638122559, "rewards/accuracy_reward": 0.4909772574901581, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1794, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 475.515625, "epoch": 0.12352900695065722, "grad_norm": 0.0, "kl": 0.5625, "learning_rate": 9.628190705145815e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1795, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 482.25, "epoch": 0.12359782533893056, "grad_norm": 1.3016795971270971, "kl": 0.63671875, "learning_rate": 9.62778153715366e-07, "loss": 0.0, "reward": 2.48991322517395, "reward_std": 0.020471885800361633, "rewards/accuracy_reward": 0.8149131536483765, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1796, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 535.75, "epoch": 0.12366664372720391, "grad_norm": 0.0, "kl": 0.71484375, "learning_rate": 9.62737215284902e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1797, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 574.53125, "epoch": 0.12373546211547726, "grad_norm": 0.5568072487719984, "kl": 0.5234375, "learning_rate": 9.62696255225103e-07, "loss": 0.0, "reward": 2.2265625, "reward_std": 0.06629125773906708, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1798, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 586.9375, "epoch": 0.1238042805037506, "grad_norm": 0.3402199732310535, "kl": 0.5078125, "learning_rate": 9.626552735378835e-07, "loss": 0.0, "reward": 1.8721128702163696, "reward_std": 0.10383790731430054, "rewards/accuracy_reward": 0.7721128463745117, "rewards/format_reward": 0.9375, "rewards/transform_reward": 0.0, "step": 1799, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 587.40625, "epoch": 0.12387309889202396, "grad_norm": 0.714885934876993, "kl": 0.5078125, "learning_rate": 9.626142702251591e-07, "loss": 0.0, "reward": 2.031552314758301, "reward_std": 0.15835809707641602, "rewards/accuracy_reward": 0.8753024339675903, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 1800, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 501.796875, "epoch": 0.1239419172802973, "grad_norm": 0.9613391836653133, "kl": 0.69921875, "learning_rate": 9.625732452888466e-07, "loss": 0.0, "reward": 2.28125, "reward_std": 0.12246952950954437, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1801, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 564.609375, "epoch": 0.12401073566857064, "grad_norm": 1.1128997774732352, "kl": 0.5, "learning_rate": 9.625321987308634e-07, "loss": 0.0, "reward": 2.3830394744873047, "reward_std": 0.2254602611064911, "rewards/accuracy_reward": 0.7846018075942993, "rewards/format_reward": 0.921875, "rewards/transform_reward": 0.4921875, "step": 1802, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 561.140625, "epoch": 0.12407955405684398, "grad_norm": 0.5241490506698497, "kl": 0.49609375, "learning_rate": 9.62491130553128e-07, "loss": -0.0, "reward": 2.4693045616149902, "reward_std": 0.1790734827518463, "rewards/accuracy_reward": 0.8489919900894165, "rewards/format_reward": 0.953125, "rewards/transform_reward": 0.4765625, "step": 1803, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 546.71875, "epoch": 0.12414837244511734, "grad_norm": 0.5107537917426904, "kl": 0.5234375, "learning_rate": 9.624500407575604e-07, "loss": 0.0, "reward": 2.478508234024048, "reward_std": 0.0018087909556925297, "rewards/accuracy_reward": 0.8035081624984741, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1804, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 585.5625, "epoch": 0.12421719083339068, "grad_norm": 3.127097267342884, "kl": 0.50390625, "learning_rate": 9.624089293460807e-07, "loss": 0.0, "reward": 1.7240010499954224, "reward_std": 0.2257099151611328, "rewards/accuracy_reward": 0.7083759903907776, "rewards/format_reward": 0.875, "rewards/transform_reward": 0.0, "step": 1805, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 534.59375, "epoch": 0.12428600922166402, "grad_norm": 0.0, "kl": 0.53125, "learning_rate": 9.62367796320611e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1806, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 492.140625, "epoch": 0.12435482760993738, "grad_norm": 0.0, "kl": 0.55859375, "learning_rate": 9.623266416830735e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1807, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 486.75, "epoch": 0.12442364599821072, "grad_norm": 1.0674628885154132, "kl": 0.609375, "learning_rate": 9.622854654353923e-07, "loss": 0.0, "reward": 2.4135522842407227, "reward_std": 0.17936022579669952, "rewards/accuracy_reward": 0.7573021650314331, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1808, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 478.453125, "epoch": 0.12449246438648406, "grad_norm": 0.6933603128496586, "kl": 0.6484375, "learning_rate": 9.62244267579492e-07, "loss": -0.0, "reward": 2.4937500953674316, "reward_std": 0.07763238251209259, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1809, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 536.703125, "epoch": 0.12456128277475742, "grad_norm": 0.33601338185636465, "kl": 0.51953125, "learning_rate": 9.622030481172979e-07, "loss": 0.0, "reward": 1.8656251430511475, "reward_std": 0.09722718596458435, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1810, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 517.453125, "epoch": 0.12463010116303076, "grad_norm": 0.0, "kl": 0.5546875, "learning_rate": 9.62161807050737e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1811, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 503.6875, "epoch": 0.1246989195513041, "grad_norm": 1.0031228532847067, "kl": 0.5390625, "learning_rate": 9.62120544381737e-07, "loss": 0.0, "reward": 2.0658297538757324, "reward_std": 0.08103036135435104, "rewards/accuracy_reward": 0.8752047419548035, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1812, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 489.890625, "epoch": 0.12476773793957746, "grad_norm": 0.8724091975652067, "kl": 0.55859375, "learning_rate": 9.620792601122266e-07, "loss": 0.0, "reward": 2.3578124046325684, "reward_std": 0.11932426691055298, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1813, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 475.328125, "epoch": 0.1248365563278508, "grad_norm": 4.420822196284107, "kl": 0.55078125, "learning_rate": 9.620379542441353e-07, "loss": -0.0, "reward": 1.7797043323516846, "reward_std": 0.05504102259874344, "rewards/accuracy_reward": 0.6297043561935425, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1814, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 496.0, "epoch": 0.12490537471612415, "grad_norm": 0.8524936321718284, "kl": 0.54296875, "learning_rate": 9.61996626779394e-07, "loss": -0.0, "reward": 2.3081626892089844, "reward_std": 0.0570453479886055, "rewards/accuracy_reward": 0.6612879037857056, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1815, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 511.734375, "epoch": 0.1249741931043975, "grad_norm": 0.2610160110016368, "kl": 0.56640625, "learning_rate": 9.619552777199345e-07, "loss": 0.0, "reward": 2.5078125, "reward_std": 0.11932426691055298, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1816, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 510.859375, "epoch": 0.12504301149267083, "grad_norm": 0.5773290331872222, "kl": 0.546875, "learning_rate": 9.619139070676894e-07, "loss": 0.0, "reward": 2.4741320610046387, "reward_std": 0.0029823007062077522, "rewards/accuracy_reward": 0.7991318702697754, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1817, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 536.828125, "epoch": 0.1251118298809442, "grad_norm": 0.9120041014400547, "kl": 0.6953125, "learning_rate": 9.618725148245926e-07, "loss": -0.0, "reward": 2.3544459342956543, "reward_std": 0.10862581431865692, "rewards/accuracy_reward": 0.7919459342956543, "rewards/format_reward": 0.9375, "rewards/transform_reward": 0.4375, "step": 1818, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 484.296875, "epoch": 0.12518064826921754, "grad_norm": 1.4033292258443792, "kl": 0.60546875, "learning_rate": 9.618311009925788e-07, "loss": 0.0, "reward": 1.7954431772232056, "reward_std": 0.07222801446914673, "rewards/accuracy_reward": 0.6516931056976318, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1819, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 470.515625, "epoch": 0.12524946665749087, "grad_norm": 0.0, "kl": 0.6015625, "learning_rate": 9.617896655735835e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1820, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 485.140625, "epoch": 0.12531828504576423, "grad_norm": 0.5419636061064842, "kl": 0.58203125, "learning_rate": 9.617482085695441e-07, "loss": -0.0, "reward": 2.6572458744049072, "reward_std": 0.013875906355679035, "rewards/accuracy_reward": 0.9572458267211914, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1821, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 474.46875, "epoch": 0.12538710343403758, "grad_norm": 0.4934084008601197, "kl": 0.59765625, "learning_rate": 9.617067299823977e-07, "loss": -0.0, "reward": 1.8375855684280396, "reward_std": 0.0028028334490954876, "rewards/accuracy_reward": 0.6875855326652527, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1822, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 504.34375, "epoch": 0.1254559218223109, "grad_norm": 1.9582411567842428, "kl": 0.5546875, "learning_rate": 9.616652298140836e-07, "loss": -0.0, "reward": 2.5373575687408447, "reward_std": 0.05892045050859451, "rewards/accuracy_reward": 0.8404825329780579, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1823, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 489.859375, "epoch": 0.12552474021058427, "grad_norm": 0.0, "kl": 0.5625, "learning_rate": 9.616237080665413e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1824, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 473.984375, "epoch": 0.12559355859885762, "grad_norm": 0.6143873419653421, "kl": 0.609375, "learning_rate": 9.615821647417117e-07, "loss": -0.0, "reward": 1.9758641719818115, "reward_std": 0.0051414258778095245, "rewards/accuracy_reward": 0.800864040851593, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1825, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 473.21875, "epoch": 0.12566237698713095, "grad_norm": 0.0, "kl": 0.6171875, "learning_rate": 9.61540599841537e-07, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1826, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 483.921875, "epoch": 0.1257311953754043, "grad_norm": 0.46149234790925076, "kl": 0.6015625, "learning_rate": 9.614990133679595e-07, "loss": -0.0, "reward": 1.8203279972076416, "reward_std": 0.003027022583410144, "rewards/accuracy_reward": 0.67032790184021, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1827, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 498.171875, "epoch": 0.12580001376367766, "grad_norm": 1.1193091174078542, "kl": 0.5625, "learning_rate": 9.614574053229233e-07, "loss": 0.0, "reward": 1.8062500953674316, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1828, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 472.109375, "epoch": 0.125868832151951, "grad_norm": 1.1269971593351122, "kl": 0.62109375, "learning_rate": 9.61415775708373e-07, "loss": -0.0, "reward": 2.310502767562866, "reward_std": 0.01200297474861145, "rewards/accuracy_reward": 0.635502815246582, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1829, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 512.015625, "epoch": 0.12593765054022435, "grad_norm": 0.0, "kl": 0.53515625, "learning_rate": 9.613741245262548e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1830, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 518.03125, "epoch": 0.1260064689284977, "grad_norm": 1.130454097094035, "kl": 0.56640625, "learning_rate": 9.613324517785153e-07, "loss": 0.0, "reward": 1.9633218050003052, "reward_std": 0.1625348925590515, "rewards/accuracy_reward": 0.8195717930793762, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 1831, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 501.390625, "epoch": 0.12607528731677103, "grad_norm": 0.0, "kl": 0.7265625, "learning_rate": 9.612907574671025e-07, "loss": 0.0, "reward": 2.0999999046325684, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1832, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 470.671875, "epoch": 0.1261441057050444, "grad_norm": 0.8248071172328342, "kl": 0.62890625, "learning_rate": 9.612490415939653e-07, "loss": 0.0, "reward": 2.456026792526245, "reward_std": 0.02207360602915287, "rewards/accuracy_reward": 0.7810268998146057, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1833, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 470.5625, "epoch": 0.12621292409331775, "grad_norm": 0.0, "kl": 0.6640625, "learning_rate": 9.612073041610536e-07, "loss": 0.0, "reward": 2.4000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1834, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 525.46875, "epoch": 0.12628174248159107, "grad_norm": 0.37840373605623007, "kl": 0.546875, "learning_rate": 9.611655451703183e-07, "loss": 0.0, "reward": 2.6117196083068848, "reward_std": 0.003670122241601348, "rewards/accuracy_reward": 0.9117193818092346, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1835, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 465.828125, "epoch": 0.12635056086986443, "grad_norm": 1.0934187395496375, "kl": 0.625, "learning_rate": 9.611237646237112e-07, "loss": -0.0, "reward": 1.934602975845337, "reward_std": 0.07134626060724258, "rewards/accuracy_reward": 0.7658529281616211, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1836, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 506.921875, "epoch": 0.1264193792581378, "grad_norm": 0.0, "kl": 0.56640625, "learning_rate": 9.610819625231854e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1837, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 486.46875, "epoch": 0.12648819764641112, "grad_norm": 0.8173395972991294, "kl": 0.7734375, "learning_rate": 9.610401388706945e-07, "loss": 0.0, "reward": 2.1812500953674316, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1838, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 467.875, "epoch": 0.12655701603468447, "grad_norm": 0.0, "kl": 0.6171875, "learning_rate": 9.609982936681938e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1839, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 498.890625, "epoch": 0.12662583442295783, "grad_norm": 0.4806979170849283, "kl": 0.578125, "learning_rate": 9.60956426917639e-07, "loss": -0.0, "reward": 2.3532180786132812, "reward_std": 0.00574507424607873, "rewards/accuracy_reward": 0.7032179832458496, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1840, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 463.328125, "epoch": 0.12669465281123116, "grad_norm": 0.0, "kl": 0.62890625, "learning_rate": 9.60914538620987e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1841, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 463.703125, "epoch": 0.1267634711995045, "grad_norm": 0.0, "kl": 0.6328125, "learning_rate": 9.60872628780196e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1842, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 478.640625, "epoch": 0.12683228958777784, "grad_norm": 0.0, "kl": 0.609375, "learning_rate": 9.608306973972246e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1843, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 478.515625, "epoch": 0.1269011079760512, "grad_norm": 0.0, "kl": 0.61328125, "learning_rate": 9.607887444740331e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1844, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 564.75, "epoch": 0.12696992636432455, "grad_norm": 0.8775411248050511, "kl": 0.52734375, "learning_rate": 9.607467700125825e-07, "loss": -0.0, "reward": 1.6324875354766846, "reward_std": 0.22631990909576416, "rewards/accuracy_reward": 0.6074874401092529, "rewards/format_reward": 0.875, "rewards/transform_reward": 0.0, "step": 1845, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 493.453125, "epoch": 0.12703874475259788, "grad_norm": 0.9416328008683922, "kl": 0.7265625, "learning_rate": 9.607047740148342e-07, "loss": -0.0, "reward": 1.8298155069351196, "reward_std": 0.051107343286275864, "rewards/accuracy_reward": 0.679815411567688, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1846, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 476.703125, "epoch": 0.12710756314087124, "grad_norm": 0.6832548507602672, "kl": 0.62109375, "learning_rate": 9.60662756482752e-07, "loss": -0.0, "reward": 1.9009469747543335, "reward_std": 0.0026784322690218687, "rewards/accuracy_reward": 0.7509469985961914, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1847, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 484.890625, "epoch": 0.1271763815291446, "grad_norm": 1.1336778104244394, "kl": 0.59375, "learning_rate": 9.606207174182992e-07, "loss": 0.0, "reward": 2.012500047683716, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1848, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 488.640625, "epoch": 0.12724519991741792, "grad_norm": 1.7200011106102686, "kl": 0.609375, "learning_rate": 9.605786568234412e-07, "loss": 0.0, "reward": 2.034860610961914, "reward_std": 0.10616108775138855, "rewards/accuracy_reward": 0.8723604679107666, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 1849, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 528.75, "epoch": 0.12731401830569128, "grad_norm": 0.0, "kl": 0.609375, "learning_rate": 9.60536574700144e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1850, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.375, "completion_length": 479.46875, "epoch": 0.12738283669396464, "grad_norm": 0.0, "kl": 0.59765625, "learning_rate": 9.604944710503744e-07, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1851, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 511.703125, "epoch": 0.12745165508223796, "grad_norm": 0.0, "kl": 0.5859375, "learning_rate": 9.604523458761008e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1852, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 460.71875, "epoch": 0.12752047347051132, "grad_norm": 0.7937242271249947, "kl": 0.6328125, "learning_rate": 9.604101991792915e-07, "loss": 0.0, "reward": 2.012500047683716, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1853, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 455.171875, "epoch": 0.12758929185878468, "grad_norm": 0.0, "kl": 0.65625, "learning_rate": 9.60368030961917e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1854, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 501.859375, "epoch": 0.127658110247058, "grad_norm": 0.0, "kl": 0.67578125, "learning_rate": 9.603258412259486e-07, "loss": 0.0, "reward": 2.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1855, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 481.3125, "epoch": 0.12772692863533136, "grad_norm": 0.0, "kl": 0.7421875, "learning_rate": 9.60283629973358e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1856, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 541.421875, "epoch": 0.12779574702360472, "grad_norm": 1.124737993850691, "kl": 0.70703125, "learning_rate": 9.60241397206118e-07, "loss": -0.0, "reward": 1.9986598491668701, "reward_std": 0.09711579978466034, "rewards/accuracy_reward": 0.8361598253250122, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.0, "step": 1857, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 504.5, "epoch": 0.12786456541187805, "grad_norm": 0.0, "kl": 0.7578125, "learning_rate": 9.60199142926203e-07, "loss": 0.0, "reward": 2.3625001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3125, "step": 1858, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 466.484375, "epoch": 0.1279333838001514, "grad_norm": 0.43953156035835106, "kl": 0.62109375, "learning_rate": 9.601568671355882e-07, "loss": -0.0, "reward": 2.1092371940612793, "reward_std": 0.0016125631518661976, "rewards/accuracy_reward": 0.9092372059822083, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1859, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 482.546875, "epoch": 0.12800220218842476, "grad_norm": 1.053876118884947, "kl": 0.765625, "learning_rate": 9.60114569836249e-07, "loss": -0.0, "reward": 1.709375023841858, "reward_std": 0.12182654440402985, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1860, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 484.203125, "epoch": 0.12807102057669809, "grad_norm": 0.0, "kl": 0.75390625, "learning_rate": 9.600722510301636e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1861, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 502.625, "epoch": 0.12813983896497144, "grad_norm": 1.0899390117407566, "kl": 0.7265625, "learning_rate": 9.600299107193088e-07, "loss": 0.0, "reward": 2.2683961391448975, "reward_std": 0.1482487916946411, "rewards/accuracy_reward": 0.74652099609375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.375, "step": 1862, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 528.390625, "epoch": 0.1282086573532448, "grad_norm": 1.6485623621844876, "kl": 0.72265625, "learning_rate": 9.599875489056646e-07, "loss": 0.0, "reward": 1.5835764408111572, "reward_std": 0.12835273146629333, "rewards/accuracy_reward": 0.46795135736465454, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1863, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 478.6875, "epoch": 0.12827747574151813, "grad_norm": 0.0, "kl": 0.7265625, "learning_rate": 9.599451655912107e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1864, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 521.625, "epoch": 0.12834629412979148, "grad_norm": 0.5203517338402514, "kl": 0.62109375, "learning_rate": 9.599027607779282e-07, "loss": -0.0, "reward": 2.619786262512207, "reward_std": 0.0021264278329908848, "rewards/accuracy_reward": 0.9197863340377808, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1865, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 441.140625, "epoch": 0.12841511251806484, "grad_norm": 0.6424707363779527, "kl": 0.6484375, "learning_rate": 9.598603344677992e-07, "loss": -0.0, "reward": 2.619340419769287, "reward_std": 0.010056725703179836, "rewards/accuracy_reward": 0.9224652647972107, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1866, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 449.21875, "epoch": 0.12848393090633817, "grad_norm": 1.5689359389105104, "kl": 0.75, "learning_rate": 9.598178866628067e-07, "loss": -0.0, "reward": 2.4562501907348633, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1867, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 443.6875, "epoch": 0.12855274929461152, "grad_norm": 1.6098062193869918, "kl": 0.6328125, "learning_rate": 9.59775417364935e-07, "loss": -0.0, "reward": 2.125, "reward_std": 0.08017837256193161, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1868, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 470.890625, "epoch": 0.12862156768288488, "grad_norm": 0.0, "kl": 0.6640625, "learning_rate": 9.597329265761693e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1869, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 451.859375, "epoch": 0.1286903860711582, "grad_norm": 0.6111192297619247, "kl": 0.78125, "learning_rate": 9.596904142984953e-07, "loss": 0.0, "reward": 2.1164612770080566, "reward_std": 0.00555745605379343, "rewards/accuracy_reward": 0.5539612770080566, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1870, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.25, "completion_length": 465.703125, "epoch": 0.12875920445943156, "grad_norm": 0.0, "kl": 0.58984375, "learning_rate": 9.596478805339005e-07, "loss": 0.0, "reward": 1.9000000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1871, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 468.1875, "epoch": 0.12882802284770492, "grad_norm": 3.155470986389447, "kl": 0.65234375, "learning_rate": 9.59605325284373e-07, "loss": -0.0, "reward": 1.9937500953674316, "reward_std": 0.21650539338588715, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1872, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 426.90625, "epoch": 0.12889684123597825, "grad_norm": 0.9491946532777171, "kl": 0.59765625, "learning_rate": 9.595627485519017e-07, "loss": 0.0, "reward": 2.125, "reward_std": 0.13887301087379456, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1873, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 474.359375, "epoch": 0.1289656596242516, "grad_norm": 0.6139408012808504, "kl": 0.6171875, "learning_rate": 9.595201503384766e-07, "loss": -0.0, "reward": 2.6322426795959473, "reward_std": 0.012550569139420986, "rewards/accuracy_reward": 0.932242751121521, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1874, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 458.765625, "epoch": 0.12903447801252493, "grad_norm": 0.7828884779827712, "kl": 0.58984375, "learning_rate": 9.594775306460893e-07, "loss": -0.0, "reward": 2.6237244606018066, "reward_std": 0.003945815376937389, "rewards/accuracy_reward": 0.9237245321273804, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1875, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 482.640625, "epoch": 0.1291032964007983, "grad_norm": 0.7287267035658554, "kl": 0.640625, "learning_rate": 9.594348894767319e-07, "loss": 0.0, "reward": 1.948254942893982, "reward_std": 0.07527752220630646, "rewards/accuracy_reward": 0.7795048952102661, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1876, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 469.15625, "epoch": 0.12917211478907165, "grad_norm": 1.0069659448304684, "kl": 0.6328125, "learning_rate": 9.593922268323971e-07, "loss": 0.0, "reward": 2.191988945007324, "reward_std": 0.06943651288747787, "rewards/accuracy_reward": 0.5732391476631165, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1877, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 440.921875, "epoch": 0.12924093317734497, "grad_norm": 0.0, "kl": 0.8125, "learning_rate": 9.593495427150792e-07, "loss": 0.0, "reward": 2.700000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1878, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 462.671875, "epoch": 0.12930975156561833, "grad_norm": 0.4359365909921414, "kl": 0.66796875, "learning_rate": 9.593068371267737e-07, "loss": 0.0, "reward": 1.984429121017456, "reward_std": 0.003946966491639614, "rewards/accuracy_reward": 0.8094290494918823, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1879, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 420.96875, "epoch": 0.1293785699538917, "grad_norm": 0.0, "kl": 0.609375, "learning_rate": 9.59264110069476e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1880, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 418.171875, "epoch": 0.12944738834216502, "grad_norm": 0.0, "kl": 0.60546875, "learning_rate": 9.592213615451842e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1881, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 414.828125, "epoch": 0.12951620673043837, "grad_norm": 0.6300970969852345, "kl": 0.609375, "learning_rate": 9.591785915558959e-07, "loss": 0.0, "reward": 1.7960638999938965, "reward_std": 0.003115026978775859, "rewards/accuracy_reward": 0.6460638046264648, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1882, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 433.578125, "epoch": 0.12958502511871173, "grad_norm": 0.8494328259158789, "kl": 0.71484375, "learning_rate": 9.591358001036102e-07, "loss": -0.0, "reward": 2.1951727867126465, "reward_std": 0.016490407288074493, "rewards/accuracy_reward": 0.7139227390289307, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3125, "step": 1883, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 421.515625, "epoch": 0.12965384350698506, "grad_norm": 0.7118432221965063, "kl": 0.59375, "learning_rate": 9.590929871903276e-07, "loss": 0.0, "reward": 1.8178006410598755, "reward_std": 0.011702937074005604, "rewards/accuracy_reward": 0.6709256768226624, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1884, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 431.546875, "epoch": 0.1297226618952584, "grad_norm": 1.1969415370619898, "kl": 0.75390625, "learning_rate": 9.59050152818049e-07, "loss": -0.0, "reward": 2.2625222206115723, "reward_std": 0.07831914722919464, "rewards/accuracy_reward": 0.6062721014022827, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1885, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 410.21875, "epoch": 0.12979148028353177, "grad_norm": 4.351708204485565, "kl": 0.61328125, "learning_rate": 9.590072969887768e-07, "loss": 0.0, "reward": 1.87108314037323, "reward_std": 0.13973532617092133, "rewards/accuracy_reward": 0.7179580926895142, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1886, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 414.65625, "epoch": 0.1298602986718051, "grad_norm": 2.771485211336383, "kl": 0.76953125, "learning_rate": 9.58964419704514e-07, "loss": 0.0, "reward": 2.137126922607422, "reward_std": 0.09613067656755447, "rewards/accuracy_reward": 0.6246269941329956, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1887, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.546875, "epoch": 0.12992911706007845, "grad_norm": 0.0, "kl": 0.625, "learning_rate": 9.589215209672646e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1888, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 417.0, "epoch": 0.1299979354483518, "grad_norm": 0.8621936160318693, "kl": 0.6171875, "learning_rate": 9.588786007790341e-07, "loss": 0.0, "reward": 1.9562500715255737, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1889, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 413.046875, "epoch": 0.13006675383662514, "grad_norm": 0.0, "kl": 0.62109375, "learning_rate": 9.588356591418286e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1890, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 381.234375, "epoch": 0.1301355722248985, "grad_norm": 1.8071920337302274, "kl": 0.74609375, "learning_rate": 9.587926960576554e-07, "loss": 0.0, "reward": 2.53125, "reward_std": 0.18213431537151337, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1891, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 426.390625, "epoch": 0.13020439061317185, "grad_norm": 3.90182486323414, "kl": 0.78125, "learning_rate": 9.587497115285223e-07, "loss": 0.0, "reward": 1.628405213356018, "reward_std": 0.1745598167181015, "rewards/accuracy_reward": 0.5002802014350891, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1892, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 402.46875, "epoch": 0.13027320900144518, "grad_norm": 0.8701126299399643, "kl": 0.7421875, "learning_rate": 9.58706705556439e-07, "loss": -0.0, "reward": 2.474120616912842, "reward_std": 0.011592415161430836, "rewards/accuracy_reward": 0.8022454977035522, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1893, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 425.125, "epoch": 0.13034202738971853, "grad_norm": 0.5221062515146265, "kl": 0.58203125, "learning_rate": 9.586636781434152e-07, "loss": -0.0, "reward": 1.6798622608184814, "reward_std": 0.010438848286867142, "rewards/accuracy_reward": 0.5548622012138367, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1894, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 437.09375, "epoch": 0.1304108457779919, "grad_norm": 0.7976339518395527, "kl": 0.640625, "learning_rate": 9.586206292914623e-07, "loss": 0.0, "reward": 2.062061309814453, "reward_std": 0.01737387292087078, "rewards/accuracy_reward": 0.8651862144470215, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1895, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 418.671875, "epoch": 0.13047966416626522, "grad_norm": 0.6525162462519513, "kl": 0.77734375, "learning_rate": 9.585775590025926e-07, "loss": 0.0, "reward": 2.418750047683716, "reward_std": 0.011572758667171001, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1896, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.75, "epoch": 0.13054848255453858, "grad_norm": 0.0, "kl": 0.6484375, "learning_rate": 9.585344672788193e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1897, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 417.109375, "epoch": 0.13061730094281193, "grad_norm": 1.2939416966421315, "kl": 0.7265625, "learning_rate": 9.584913541221563e-07, "loss": 0.0, "reward": 2.4164228439331055, "reward_std": 0.013573510572314262, "rewards/accuracy_reward": 0.9039226770401001, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3125, "step": 1898, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 416.734375, "epoch": 0.13068611933108526, "grad_norm": 1.405801367314241, "kl": 0.671875, "learning_rate": 9.584482195346194e-07, "loss": -0.0, "reward": 2.613137722015381, "reward_std": 0.014381229877471924, "rewards/accuracy_reward": 0.9193874597549438, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1899, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 421.59375, "epoch": 0.13075493771935862, "grad_norm": 1.8802008377991728, "kl": 0.66796875, "learning_rate": 9.584050635182241e-07, "loss": 0.0, "reward": 1.8062500953674316, "reward_std": 0.07763238251209259, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1900, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 451.75, "epoch": 0.13082375610763197, "grad_norm": 3.5739872415008924, "kl": 0.75, "learning_rate": 9.58361886074988e-07, "loss": -0.0, "reward": 2.0531249046325684, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.4375, "step": 1901, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 425.640625, "epoch": 0.1308925744959053, "grad_norm": 0.9083464854138473, "kl": 0.6171875, "learning_rate": 9.583186872069292e-07, "loss": -0.0, "reward": 2.0789871215820312, "reward_std": 0.08046352118253708, "rewards/accuracy_reward": 0.8946119546890259, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1902, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 435.046875, "epoch": 0.13096139288417866, "grad_norm": 3.2048528318064853, "kl": 0.625, "learning_rate": 9.58275466916067e-07, "loss": -0.0, "reward": 1.9040833711624146, "reward_std": 0.08921078592538834, "rewards/accuracy_reward": 0.7384582757949829, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1903, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 419.734375, "epoch": 0.131030211272452, "grad_norm": 0.7343610222316658, "kl": 0.6171875, "learning_rate": 9.582322252044216e-07, "loss": -0.0, "reward": 1.8744122982025146, "reward_std": 0.01822376251220703, "rewards/accuracy_reward": 0.7025372385978699, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1904, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 461.765625, "epoch": 0.13109902966072534, "grad_norm": 0.6978568310493106, "kl": 0.59765625, "learning_rate": 9.58188962074014e-07, "loss": -0.0, "reward": 2.33447265625, "reward_std": 0.048697128891944885, "rewards/accuracy_reward": 0.6844725608825684, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1905, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 440.1875, "epoch": 0.1311678480489987, "grad_norm": 0.7466254800994578, "kl": 0.5859375, "learning_rate": 9.581456775268668e-07, "loss": -0.0, "reward": 1.9000000953674316, "reward_std": 0.155264750123024, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1906, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 440.703125, "epoch": 0.13123666643727203, "grad_norm": 0.9487401838625816, "kl": 0.6328125, "learning_rate": 9.581023715650027e-07, "loss": -0.0, "reward": 2.265625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1907, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 476.71875, "epoch": 0.13130548482554538, "grad_norm": 0.0, "kl": 0.54296875, "learning_rate": 9.580590441904463e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1908, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 451.1875, "epoch": 0.13137430321381874, "grad_norm": 0.0, "kl": 0.58984375, "learning_rate": 9.58015695405223e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1909, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 458.5625, "epoch": 0.13144312160209207, "grad_norm": 0.0, "kl": 0.5625, "learning_rate": 9.579723252113585e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1910, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 454.953125, "epoch": 0.13151193999036542, "grad_norm": 0.0, "kl": 0.59375, "learning_rate": 9.579289336108802e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1911, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 489.515625, "epoch": 0.13158075837863878, "grad_norm": 0.9377841201950619, "kl": 0.6640625, "learning_rate": 9.578855206058163e-07, "loss": -0.0, "reward": 2.3312501907348633, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1912, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 474.8125, "epoch": 0.1316495767669121, "grad_norm": 0.5424042446838134, "kl": 0.58984375, "learning_rate": 9.578420861981963e-07, "loss": 0.0, "reward": 2.4156250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1913, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 492.828125, "epoch": 0.13171839515518546, "grad_norm": 0.388579515645407, "kl": 0.546875, "learning_rate": 9.5779863039005e-07, "loss": 0.0, "reward": 1.8253073692321777, "reward_std": 0.002439495176076889, "rewards/accuracy_reward": 0.6753072738647461, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1914, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 536.0, "epoch": 0.13178721354345882, "grad_norm": 1.9249990067021687, "kl": 0.5390625, "learning_rate": 9.57755153183409e-07, "loss": 0.0, "reward": 2.525632381439209, "reward_std": 0.09661959111690521, "rewards/accuracy_reward": 0.8521947264671326, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4921875, "step": 1915, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 470.421875, "epoch": 0.13185603193173215, "grad_norm": 0.46568143546384494, "kl": 0.55078125, "learning_rate": 9.577116545803053e-07, "loss": 0.0, "reward": 2.3625001907348633, "reward_std": 0.06943651288747787, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1916, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 481.25, "epoch": 0.1319248503200055, "grad_norm": 1.579057492637908, "kl": 0.55078125, "learning_rate": 9.57668134582772e-07, "loss": 0.0, "reward": 1.815904140472412, "reward_std": 0.07851164788007736, "rewards/accuracy_reward": 0.6846539974212646, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1917, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 494.265625, "epoch": 0.13199366870827886, "grad_norm": 0.28780501664732167, "kl": 0.65234375, "learning_rate": 9.576245931928437e-07, "loss": 0.0, "reward": 2.453125, "reward_std": 0.09722718596458435, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.4375, "step": 1918, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 478.546875, "epoch": 0.1320624870965522, "grad_norm": 0.0, "kl": 0.57421875, "learning_rate": 9.575810304125554e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1919, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 535.3125, "epoch": 0.13213130548482555, "grad_norm": 0.9341409277589569, "kl": 0.50390625, "learning_rate": 9.57537446243943e-07, "loss": -0.0, "reward": 2.428877830505371, "reward_std": 0.16586540639400482, "rewards/accuracy_reward": 0.7507528066635132, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1920, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 457.875, "epoch": 0.1322001238730989, "grad_norm": 0.528333021772185, "kl": 0.578125, "learning_rate": 9.574938406890442e-07, "loss": 0.0, "reward": 2.6812500953674316, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1921, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 481.984375, "epoch": 0.13226894226137223, "grad_norm": 2.077568560318124, "kl": 0.640625, "learning_rate": 9.574502137498972e-07, "loss": 0.0, "reward": 2.0159716606140137, "reward_std": 0.16141727566719055, "rewards/accuracy_reward": 0.5753465294837952, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.3125, "step": 1922, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 482.421875, "epoch": 0.13233776064964559, "grad_norm": 5.1053730973257565, "kl": 0.55859375, "learning_rate": 9.574065654285408e-07, "loss": -0.0, "reward": 1.6952545642852783, "reward_std": 0.11448792368173599, "rewards/accuracy_reward": 0.5640046000480652, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1923, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 487.796875, "epoch": 0.13240657903791894, "grad_norm": 0.0, "kl": 0.54296875, "learning_rate": 9.573628957270157e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1924, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 494.71875, "epoch": 0.13247539742619227, "grad_norm": 0.8355774070186271, "kl": 0.57421875, "learning_rate": 9.573192046473628e-07, "loss": 0.0, "reward": 2.106250047683716, "reward_std": 0.07763238251209259, "rewards/accuracy_reward": 0.921875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1925, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 467.828125, "epoch": 0.13254421581446563, "grad_norm": 1.043648450205054, "kl": 0.59375, "learning_rate": 9.572754921916244e-07, "loss": -0.0, "reward": 2.03125, "reward_std": 0.14706888794898987, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1926, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.0, "completion_length": 499.09375, "epoch": 0.13261303420273898, "grad_norm": 10.272688914392118, "kl": 0.58203125, "learning_rate": 9.572317583618437e-07, "loss": -0.0, "reward": 2.0843749046325684, "reward_std": 0.19126306474208832, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1927, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 482.09375, "epoch": 0.1326818525910123, "grad_norm": 0.8650978655295709, "kl": 0.58984375, "learning_rate": 9.57188003160065e-07, "loss": -0.0, "reward": 2.1437501907348633, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1928, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 468.78125, "epoch": 0.13275067097928567, "grad_norm": 0.0, "kl": 0.5859375, "learning_rate": 9.571442265883335e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1929, "temperature": 1.0 }, { "all_correct": 0.5, "all_wrong": 0.375, "completion_length": 482.734375, "epoch": 0.13281948936755902, "grad_norm": 0.8978084241466945, "kl": 0.5546875, "learning_rate": 9.571004286486953e-07, "loss": -0.0, "reward": 1.7177534103393555, "reward_std": 0.018072593957185745, "rewards/accuracy_reward": 0.5927532911300659, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1930, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 488.921875, "epoch": 0.13288830775583235, "grad_norm": 0.7608038703584533, "kl": 0.55078125, "learning_rate": 9.570566093431976e-07, "loss": -0.0, "reward": 2.265625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1931, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 485.71875, "epoch": 0.1329571261441057, "grad_norm": 1.0950414786452574, "kl": 0.57421875, "learning_rate": 9.570127686738887e-07, "loss": -0.0, "reward": 2.1895012855529785, "reward_std": 0.11455954611301422, "rewards/accuracy_reward": 0.6832512617111206, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.375, "step": 1932, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 484.34375, "epoch": 0.13302594453237906, "grad_norm": 1.622871918479164, "kl": 0.52734375, "learning_rate": 9.56968906642818e-07, "loss": 0.0, "reward": 2.6004624366760254, "reward_std": 0.05629141628742218, "rewards/accuracy_reward": 0.9035874009132385, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1933, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 462.09375, "epoch": 0.1330947629206524, "grad_norm": 2.6160464589001937, "kl": 0.5703125, "learning_rate": 9.569250232520352e-07, "loss": -0.0, "reward": 2.012500047683716, "reward_std": 0.1496148705482483, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1934, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 486.5625, "epoch": 0.13316358130892575, "grad_norm": 0.5883121285765539, "kl": 0.578125, "learning_rate": 9.568811185035918e-07, "loss": -0.0, "reward": 2.1437501907348633, "reward_std": 0.07763238251209259, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1935, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 491.421875, "epoch": 0.1332323996971991, "grad_norm": 0.8292187970807877, "kl": 0.55078125, "learning_rate": 9.568371923995403e-07, "loss": -0.0, "reward": 2.022624969482422, "reward_std": 0.1499827802181244, "rewards/accuracy_reward": 0.8538748621940613, "rewards/format_reward": 0.984375, "rewards/transform_reward": 0.0, "step": 1936, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 463.125, "epoch": 0.13330121808547243, "grad_norm": 0.0, "kl": 0.59765625, "learning_rate": 9.567932449419334e-07, "loss": 0.0, "reward": 2.0250000953674316, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.125, "step": 1937, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 470.578125, "epoch": 0.1333700364737458, "grad_norm": 0.7138882946842736, "kl": 0.5703125, "learning_rate": 9.567492761328255e-07, "loss": -0.0, "reward": 2.2052371501922607, "reward_std": 0.004137344192713499, "rewards/accuracy_reward": 0.8177371025085449, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1875, "step": 1938, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 486.96875, "epoch": 0.13343885486201912, "grad_norm": 0.897986216773057, "kl": 0.58984375, "learning_rate": 9.56705285974272e-07, "loss": 0.0, "reward": 1.9562500715255737, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1939, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 447.890625, "epoch": 0.13350767325029247, "grad_norm": 0.0, "kl": 0.6796875, "learning_rate": 9.566612744683288e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1940, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 505.390625, "epoch": 0.13357649163856583, "grad_norm": 0.7938156191034259, "kl": 0.578125, "learning_rate": 9.56617241617053e-07, "loss": -0.0, "reward": 2.2309272289276123, "reward_std": 0.0050917137414216995, "rewards/accuracy_reward": 0.5809271931648254, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1941, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 515.734375, "epoch": 0.13364531002683916, "grad_norm": 0.9889991116196417, "kl": 0.58203125, "learning_rate": 9.565731874225031e-07, "loss": 0.0, "reward": 2.4405031204223633, "reward_std": 0.07852017879486084, "rewards/accuracy_reward": 0.7592531442642212, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1942, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 517.390625, "epoch": 0.13371412841511252, "grad_norm": 1.1057876879968285, "kl": 0.5546875, "learning_rate": 9.56529111886738e-07, "loss": 0.0, "reward": 1.715265154838562, "reward_std": 0.09573376923799515, "rewards/accuracy_reward": 0.5558901429176331, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1943, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 478.171875, "epoch": 0.13378294680338587, "grad_norm": 2.434731711330464, "kl": 0.609375, "learning_rate": 9.564850150118183e-07, "loss": 0.0, "reward": 2.6812500953674316, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.984375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1944, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 469.84375, "epoch": 0.1338517651916592, "grad_norm": 0.0, "kl": 0.68359375, "learning_rate": 9.564408967998047e-07, "loss": 0.0, "reward": 2.2375001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.1875, "step": 1945, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 472.03125, "epoch": 0.13392058357993256, "grad_norm": 4.867286594583949, "kl": 0.6171875, "learning_rate": 9.563967572527598e-07, "loss": 0.0, "reward": 2.440258741378784, "reward_std": 0.06786113977432251, "rewards/accuracy_reward": 0.7683836221694946, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1946, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 506.296875, "epoch": 0.1339894019682059, "grad_norm": 0.8573886050549939, "kl": 0.58203125, "learning_rate": 9.563525963727466e-07, "loss": 0.0, "reward": 1.8191556930541992, "reward_std": 0.001789925037883222, "rewards/accuracy_reward": 0.6691555976867676, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1947, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 446.734375, "epoch": 0.13405822035647924, "grad_norm": 0.5808696078693776, "kl": 0.6328125, "learning_rate": 9.563084141618291e-07, "loss": 0.0, "reward": 2.2604167461395264, "reward_std": 0.019287923350930214, "rewards/accuracy_reward": 0.6291666626930237, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1948, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.25, "completion_length": 491.8125, "epoch": 0.1341270387447526, "grad_norm": 0.5132799399351295, "kl": 0.59765625, "learning_rate": 9.56264210622073e-07, "loss": -0.0, "reward": 1.832019567489624, "reward_std": 0.0023644883185625076, "rewards/accuracy_reward": 0.6820197105407715, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1949, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 462.578125, "epoch": 0.13419585713302595, "grad_norm": 8.556486022681414, "kl": 0.625, "learning_rate": 9.562199857555437e-07, "loss": 0.0, "reward": 2.136751651763916, "reward_std": 0.12246951460838318, "rewards/accuracy_reward": 0.9461264610290527, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1950, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 466.15625, "epoch": 0.13426467552129928, "grad_norm": 0.0, "kl": 0.6640625, "learning_rate": 9.56175739564309e-07, "loss": 0.0, "reward": 2.1192030906677246, "reward_std": 0.0, "rewards/accuracy_reward": 0.9192029237747192, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1951, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 454.6875, "epoch": 0.13433349390957264, "grad_norm": 1.0145946965539325, "kl": 0.6875, "learning_rate": 9.561314720504369e-07, "loss": -0.0, "reward": 1.9937500953674316, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1952, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 496.515625, "epoch": 0.134402312297846, "grad_norm": 0.9556421849617827, "kl": 0.57421875, "learning_rate": 9.560871832159962e-07, "loss": 0.0, "reward": 2.012500047683716, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1953, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 465.078125, "epoch": 0.13447113068611932, "grad_norm": 0.0, "kl": 0.59765625, "learning_rate": 9.560428730630574e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1954, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 483.734375, "epoch": 0.13453994907439268, "grad_norm": 0.0, "kl": 0.625, "learning_rate": 9.559985415936917e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1955, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 458.890625, "epoch": 0.13460876746266603, "grad_norm": 0.0, "kl": 0.62890625, "learning_rate": 9.55954188809971e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1956, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 444.859375, "epoch": 0.13467758585093936, "grad_norm": 2.7950658245119557, "kl": 0.609375, "learning_rate": 9.55909814713969e-07, "loss": 0.0, "reward": 2.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1957, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 458.4375, "epoch": 0.13474640423921272, "grad_norm": 1.2605289699452065, "kl": 0.6484375, "learning_rate": 9.55865419307759e-07, "loss": -0.0, "reward": 2.1498777866363525, "reward_std": 0.04984904080629349, "rewards/accuracy_reward": 0.6873778104782104, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3125, "step": 1958, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 458.125, "epoch": 0.13481522262748608, "grad_norm": 1.6004021056215079, "kl": 0.56640625, "learning_rate": 9.558210025934169e-07, "loss": -0.0, "reward": 1.875020980834961, "reward_std": 0.0802328959107399, "rewards/accuracy_reward": 0.7187710404396057, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1959, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 449.71875, "epoch": 0.1348840410157594, "grad_norm": 2.297965175916415, "kl": 0.63671875, "learning_rate": 9.557765645730183e-07, "loss": -0.0, "reward": 2.3390626907348633, "reward_std": 0.03234682232141495, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.2890625, "step": 1960, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 462.796875, "epoch": 0.13495285940403276, "grad_norm": 1.19401322512334, "kl": 0.5546875, "learning_rate": 9.557321052486407e-07, "loss": 0.0, "reward": 2.4749999046325684, "reward_std": 0.1306653916835785, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1961, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 496.234375, "epoch": 0.13502167779230612, "grad_norm": 1.412507057634039, "kl": 0.486328125, "learning_rate": 9.55687624622362e-07, "loss": 0.0, "reward": 2.5510993003845215, "reward_std": 0.005168276838958263, "rewards/accuracy_reward": 0.8510991334915161, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1962, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 468.890625, "epoch": 0.13509049618057944, "grad_norm": 0.6212700677017353, "kl": 0.54296875, "learning_rate": 9.556431226962612e-07, "loss": 0.0, "reward": 2.4290030002593994, "reward_std": 0.013118593953549862, "rewards/accuracy_reward": 0.7540029883384705, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1963, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 452.09375, "epoch": 0.1351593145688528, "grad_norm": 0.6097199750412782, "kl": 0.53125, "learning_rate": 9.555985994724188e-07, "loss": -0.0, "reward": 2.150845527648926, "reward_std": 0.00942226406186819, "rewards/accuracy_reward": 0.9508455395698547, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1964, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 437.484375, "epoch": 0.13522813295712616, "grad_norm": 0.0, "kl": 0.60546875, "learning_rate": 9.555540549529154e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1965, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 444.921875, "epoch": 0.13529695134539949, "grad_norm": 0.729387270831952, "kl": 0.53125, "learning_rate": 9.55509489139834e-07, "loss": 0.0, "reward": 2.0945024490356445, "reward_std": 0.001923822332173586, "rewards/accuracy_reward": 0.8945023417472839, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1966, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 469.046875, "epoch": 0.13536576973367284, "grad_norm": 8.684117715416233, "kl": 0.60546875, "learning_rate": 9.554649020352566e-07, "loss": 0.0, "reward": 2.3101463317871094, "reward_std": 0.23004651069641113, "rewards/accuracy_reward": 0.8476463556289673, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.3125, "step": 1967, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.125, "completion_length": 452.8125, "epoch": 0.1354345881219462, "grad_norm": 0.0, "kl": 0.53125, "learning_rate": 9.55420293641268e-07, "loss": 0.0, "reward": 2.0500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1968, "temperature": 1.0 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 473.140625, "epoch": 0.13550340651021953, "grad_norm": 1.1844559044813625, "kl": 0.490234375, "learning_rate": 9.553756639599531e-07, "loss": -0.0, "reward": 1.7216273546218872, "reward_std": 0.08082813024520874, "rewards/accuracy_reward": 0.5778773427009583, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1969, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 453.390625, "epoch": 0.13557222489849288, "grad_norm": 0.0, "kl": 0.64453125, "learning_rate": 9.55331012993398e-07, "loss": 0.0, "reward": 2.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1970, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 469.8125, "epoch": 0.1356410432867662, "grad_norm": 0.0, "kl": 0.52734375, "learning_rate": 9.552863407436896e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1971, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 483.171875, "epoch": 0.13570986167503957, "grad_norm": 1.0632925160340532, "kl": 0.5078125, "learning_rate": 9.552416472129163e-07, "loss": 0.0, "reward": 2.0169835090637207, "reward_std": 0.0711846649646759, "rewards/accuracy_reward": 0.8357334136962891, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1972, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 419.53125, "epoch": 0.13577868006331292, "grad_norm": 0.0, "kl": 0.6796875, "learning_rate": 9.55196932403167e-07, "loss": 0.0, "reward": 2.575000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1973, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 412.171875, "epoch": 0.13584749845158625, "grad_norm": 0.0, "kl": 0.67578125, "learning_rate": 9.55152196316532e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1974, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 435.328125, "epoch": 0.1359163168398596, "grad_norm": 0.42319379223501846, "kl": 0.546875, "learning_rate": 9.55107438955102e-07, "loss": 0.0, "reward": 2.03125, "reward_std": 0.0530330091714859, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1975, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 440.640625, "epoch": 0.13598513522813296, "grad_norm": 0.0, "kl": 0.66015625, "learning_rate": 9.55062660320969e-07, "loss": 0.0, "reward": 2.3000001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.25, "step": 1976, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 477.09375, "epoch": 0.1360539536164063, "grad_norm": 1.5467206473794999, "kl": 0.5234375, "learning_rate": 9.550178604162264e-07, "loss": -0.0, "reward": 2.028165578842163, "reward_std": 0.08033653348684311, "rewards/accuracy_reward": 0.8437905311584473, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1977, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 448.015625, "epoch": 0.13612277200467965, "grad_norm": 0.701650935228665, "kl": 0.5234375, "learning_rate": 9.549730392429681e-07, "loss": 0.0, "reward": 1.952553629875183, "reward_std": 0.05489245802164078, "rewards/accuracy_reward": 0.7806785106658936, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1978, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 447.625, "epoch": 0.136191590392953, "grad_norm": 0.9007398260921852, "kl": 0.56640625, "learning_rate": 9.549281968032892e-07, "loss": 0.0, "reward": 2.53125, "reward_std": 0.0530330128967762, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1979, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 443.0625, "epoch": 0.13626040878122633, "grad_norm": 0.8086443371557838, "kl": 0.53515625, "learning_rate": 9.548833330992856e-07, "loss": -0.0, "reward": 1.9651836156845093, "reward_std": 0.05670703947544098, "rewards/accuracy_reward": 0.7933087348937988, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1980, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 445.921875, "epoch": 0.1363292271694997, "grad_norm": 0.5182525714053641, "kl": 0.5546875, "learning_rate": 9.548384481330546e-07, "loss": -0.0, "reward": 1.9729909896850586, "reward_std": 0.0021598830353468657, "rewards/accuracy_reward": 0.7979909777641296, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1981, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 499.21875, "epoch": 0.13639804555777305, "grad_norm": 4.181067733122379, "kl": 0.62890625, "learning_rate": 9.547935419066936e-07, "loss": 0.0, "reward": 2.185694694519043, "reward_std": 0.21966685354709625, "rewards/accuracy_reward": 0.6825695633888245, "rewards/format_reward": 0.96875, "rewards/transform_reward": 0.375, "step": 1982, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 446.5, "epoch": 0.13646686394604637, "grad_norm": 0.542704011429418, "kl": 0.65234375, "learning_rate": 9.547486144223026e-07, "loss": -0.0, "reward": 2.1202898025512695, "reward_std": 0.002504644449800253, "rewards/accuracy_reward": 0.9202895760536194, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1983, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 442.09375, "epoch": 0.13653568233431973, "grad_norm": 4.884303523740454, "kl": 0.56640625, "learning_rate": 9.547036656819805e-07, "loss": 0.0, "reward": 2.012500047683716, "reward_std": 0.06943650543689728, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1984, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 432.453125, "epoch": 0.1366045007225931, "grad_norm": 0.0, "kl": 0.671875, "learning_rate": 9.54658695687829e-07, "loss": 0.0, "reward": 2.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3125, "step": 1985, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.125, "completion_length": 452.46875, "epoch": 0.13667331911086641, "grad_norm": 0.7046262014928899, "kl": 0.53125, "learning_rate": 9.546137044419501e-07, "loss": -0.0, "reward": 1.9937500953674316, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1986, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 431.828125, "epoch": 0.13674213749913977, "grad_norm": 4.39198919233476, "kl": 0.671875, "learning_rate": 9.545686919464466e-07, "loss": 0.0, "reward": 2.15625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1987, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 448.234375, "epoch": 0.13681095588741313, "grad_norm": 0.761555879036494, "kl": 0.5546875, "learning_rate": 9.545236582034224e-07, "loss": -0.0, "reward": 2.1437501907348633, "reward_std": 0.077632375061512, "rewards/accuracy_reward": 0.953125, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1988, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 467.5625, "epoch": 0.13687977427568646, "grad_norm": 0.0, "kl": 0.58203125, "learning_rate": 9.544786032149826e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1989, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 431.171875, "epoch": 0.1369485926639598, "grad_norm": 0.9026945867082408, "kl": 0.77734375, "learning_rate": 9.54433526983233e-07, "loss": 0.0, "reward": 2.3260185718536377, "reward_std": 0.005943531636148691, "rewards/accuracy_reward": 0.838518500328064, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3125, "step": 1990, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.125, "completion_length": 437.96875, "epoch": 0.13701741105223317, "grad_norm": 0.8508280913186413, "kl": 0.5703125, "learning_rate": 9.54388429510281e-07, "loss": 0.0, "reward": 1.9672536849975586, "reward_std": 0.0800764337182045, "rewards/accuracy_reward": 0.798503577709198, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1991, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 423.34375, "epoch": 0.1370862294405065, "grad_norm": 0.6857428627892334, "kl": 0.69921875, "learning_rate": 9.543433107982342e-07, "loss": -0.0, "reward": 2.2906250953674316, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.375, "step": 1992, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 451.46875, "epoch": 0.13715504782877985, "grad_norm": 0.0, "kl": 0.5703125, "learning_rate": 9.542981708492014e-07, "loss": 0.0, "reward": 2.5500001907348633, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1993, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 486.15625, "epoch": 0.1372238662170532, "grad_norm": 0.603935155818105, "kl": 0.5, "learning_rate": 9.54253009665293e-07, "loss": -0.0, "reward": 2.3349108695983887, "reward_std": 0.008950523100793362, "rewards/accuracy_reward": 0.6849108338356018, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1994, "temperature": 1.0 }, { "all_correct": 0.25, "all_wrong": 0.5, "completion_length": 481.375, "epoch": 0.13729268460532654, "grad_norm": 1.3343311491853935, "kl": 0.515625, "learning_rate": 9.542078272486194e-07, "loss": -0.0, "reward": 1.3906116485595703, "reward_std": 0.04630954563617706, "rewards/accuracy_reward": 0.31561169028282166, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1995, "temperature": 1.0 }, { "all_correct": 0.875, "all_wrong": 0.0, "completion_length": 471.109375, "epoch": 0.1373615029935999, "grad_norm": 0.699926072360247, "kl": 0.546875, "learning_rate": 9.54162623601293e-07, "loss": -0.0, "reward": 2.14310622215271, "reward_std": 0.005553164519369602, "rewards/accuracy_reward": 0.9431061744689941, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1996, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 493.09375, "epoch": 0.13743032138187325, "grad_norm": 2.9183092555053496, "kl": 0.59765625, "learning_rate": 9.541173987254264e-07, "loss": 0.0, "reward": 2.5163869857788086, "reward_std": 0.03527873381972313, "rewards/accuracy_reward": 0.9179496169090271, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.3984375, "step": 1997, "temperature": 1.0 }, { "all_correct": 0.625, "all_wrong": 0.0, "completion_length": 451.265625, "epoch": 0.13749913977014658, "grad_norm": 0.5034595346758733, "kl": 0.54296875, "learning_rate": 9.540721526231339e-07, "loss": 0.0, "reward": 2.265625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.5, "step": 1998, "temperature": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 436.703125, "epoch": 0.13756795815841993, "grad_norm": 0.0, "kl": 0.58203125, "learning_rate": 9.540268852965298e-07, "loss": 0.0, "reward": 2.200000047683716, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 1999, "temperature": 1.0 }, { "all_correct": 0.75, "all_wrong": 0.0, "completion_length": 434.828125, "epoch": 0.1376367765466933, "grad_norm": 0.8598806236434122, "kl": 0.57421875, "learning_rate": 9.539815967477304e-07, "loss": 0.0, "reward": 2.173584461212158, "reward_std": 0.07471458613872528, "rewards/accuracy_reward": 0.9767093658447266, "rewards/format_reward": 1.0, "rewards/transform_reward": 0.0, "step": 2000, "temperature": 1.0 } ], "logging_steps": 1.0, "max_steps": 14531, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }