{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9640831758034026, "eval_steps": 500, "global_step": 510, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "all_correct": 0.0625, "all_wrong": 0.0625, "completion_length": 95.96875, "epoch": 0.001890359168241966, "grad_norm": 2.3602562531627758, "kl": 0.0, "learning_rate": 1.999982365744487e-06, "loss": 0.0, "reward": 1.514066219329834, "reward_std": 0.375105082988739, "rewards/accuracy_reward": 0.5140663385391235, "rewards/format_reward": 1.0, "step": 1 }, { "all_correct": 0.0625, "all_wrong": 0.09375, "completion_length": 92.3515625, "epoch": 0.003780718336483932, "grad_norm": 6.640343536616842, "kl": 0.0013275146484375, "learning_rate": 1.999929463599883e-06, "loss": 0.0001, "reward": 1.417905330657959, "reward_std": 0.37063267827033997, "rewards/accuracy_reward": 0.42181164026260376, "rewards/format_reward": 0.99609375, "step": 2 }, { "all_correct": 0.0625, "all_wrong": 0.125, "completion_length": 90.35546875, "epoch": 0.005671077504725898, "grad_norm": 2.414094297005193, "kl": 0.00127410888671875, "learning_rate": 1.9998412954319676e-06, "loss": 0.0001, "reward": 1.4140625, "reward_std": 0.3621976673603058, "rewards/accuracy_reward": 0.4140625, "rewards/format_reward": 1.0, "step": 3 }, { "all_correct": 0.09375, "all_wrong": 0.1875, "completion_length": 92.7734375, "epoch": 0.007561436672967864, "grad_norm": 2.2244626207569187, "kl": 0.00150299072265625, "learning_rate": 1.9997178643503e-06, "loss": 0.0001, "reward": 1.408446192741394, "reward_std": 0.3336002826690674, "rewards/accuracy_reward": 0.40844619274139404, "rewards/format_reward": 1.0, "step": 4 }, { "all_correct": 0.125, "all_wrong": 0.0625, "completion_length": 89.6640625, "epoch": 0.00945179584120983, "grad_norm": 2.3915039557721607, "kl": 0.0021209716796875, "learning_rate": 1.999559174708112e-06, "loss": 0.0001, "reward": 1.5034363269805908, "reward_std": 0.37499552965164185, "rewards/accuracy_reward": 0.5073425769805908, "rewards/format_reward": 0.99609375, "step": 5 }, { "all_correct": 0.03125, "all_wrong": 0.09375, "completion_length": 88.1875, "epoch": 0.011342155009451797, "grad_norm": 4.082221628547588, "kl": 0.00335693359375, "learning_rate": 1.99936523210215e-06, "loss": 0.0001, "reward": 1.4816137552261353, "reward_std": 0.3769935369491577, "rewards/accuracy_reward": 0.48161375522613525, "rewards/format_reward": 1.0, "step": 6 }, { "all_correct": 0.15625, "all_wrong": 0.03125, "completion_length": 84.96875, "epoch": 0.013232514177693762, "grad_norm": 2.411737580254065, "kl": 0.00433349609375, "learning_rate": 1.999136043372481e-06, "loss": 0.0002, "reward": 1.6246747970581055, "reward_std": 0.3329503536224365, "rewards/accuracy_reward": 0.6246747970581055, "rewards/format_reward": 1.0, "step": 7 }, { "all_correct": 0.09375, "all_wrong": 0.125, "completion_length": 86.140625, "epoch": 0.015122873345935728, "grad_norm": 2.5943017844391107, "kl": 0.00537109375, "learning_rate": 1.9988716166022506e-06, "loss": 0.0002, "reward": 1.4784858226776123, "reward_std": 0.3560720384120941, "rewards/accuracy_reward": 0.4784858524799347, "rewards/format_reward": 1.0, "step": 8 }, { "all_correct": 0.0625, "all_wrong": 0.09375, "completion_length": 82.1328125, "epoch": 0.017013232514177693, "grad_norm": 2.1704796065424508, "kl": 0.00628662109375, "learning_rate": 1.998571961117397e-06, "loss": 0.0003, "reward": 1.4728260040283203, "reward_std": 0.3873444199562073, "rewards/accuracy_reward": 0.47282594442367554, "rewards/format_reward": 1.0, "step": 9 }, { "all_correct": 0.125, "all_wrong": 0.09375, "completion_length": 90.22265625, "epoch": 0.01890359168241966, "grad_norm": 1.9312909564522398, "kl": 0.0068359375, "learning_rate": 1.9982370874863233e-06, "loss": 0.0003, "reward": 1.5304478406906128, "reward_std": 0.3415384888648987, "rewards/accuracy_reward": 0.5343540906906128, "rewards/format_reward": 0.99609375, "step": 10 }, { "all_correct": 0.09375, "all_wrong": 0.0625, "completion_length": 99.89453125, "epoch": 0.020793950850661626, "grad_norm": 2.183103416981468, "kl": 0.0074462890625, "learning_rate": 1.9978670075195237e-06, "loss": 0.0003, "reward": 1.5692423582077026, "reward_std": 0.39161738753318787, "rewards/accuracy_reward": 0.5848673582077026, "rewards/format_reward": 0.984375, "step": 11 }, { "all_correct": 0.0, "all_wrong": 0.0, "completion_length": 93.22265625, "epoch": 0.022684310018903593, "grad_norm": 8.227068776940921, "kl": 0.00958251953125, "learning_rate": 1.9974617342691674e-06, "loss": 0.0004, "reward": 1.468308687210083, "reward_std": 0.46158915758132935, "rewards/accuracy_reward": 0.47612112760543823, "rewards/format_reward": 0.9921875, "step": 12 }, { "all_correct": 0.09375, "all_wrong": 0.0625, "completion_length": 87.81640625, "epoch": 0.024574669187145556, "grad_norm": 3.223495772387027, "kl": 0.01129150390625, "learning_rate": 1.997021282028639e-06, "loss": 0.0005, "reward": 1.5035626888275146, "reward_std": 0.3581737279891968, "rewards/accuracy_reward": 0.5035626888275146, "rewards/format_reward": 1.0, "step": 13 }, { "all_correct": 0.21875, "all_wrong": 0.0625, "completion_length": 88.83984375, "epoch": 0.026465028355387523, "grad_norm": 3.1552889851956594, "kl": 0.0126953125, "learning_rate": 1.9965456663320324e-06, "loss": 0.0005, "reward": 1.507861852645874, "reward_std": 0.294893354177475, "rewards/accuracy_reward": 0.507861852645874, "rewards/format_reward": 1.0, "step": 14 }, { "all_correct": 0.03125, "all_wrong": 0.125, "completion_length": 92.70703125, "epoch": 0.02835538752362949, "grad_norm": 2.6633455753050947, "kl": 0.011962890625, "learning_rate": 1.996034903953606e-06, "loss": 0.0005, "reward": 1.403101921081543, "reward_std": 0.35901227593421936, "rewards/accuracy_reward": 0.4031018912792206, "rewards/format_reward": 1.0, "step": 15 }, { "all_correct": 0.0625, "all_wrong": 0.0, "completion_length": 79.59765625, "epoch": 0.030245746691871456, "grad_norm": 4.571581966102541, "kl": 0.0162353515625, "learning_rate": 1.9954890129071873e-06, "loss": 0.0007, "reward": 1.5757702589035034, "reward_std": 0.3923387825489044, "rewards/accuracy_reward": 0.5757702589035034, "rewards/format_reward": 1.0, "step": 16 }, { "all_correct": 0.09375, "all_wrong": 0.03125, "completion_length": 82.7890625, "epoch": 0.03213610586011342, "grad_norm": 2.393421904457873, "kl": 0.01361083984375, "learning_rate": 1.9949080124455415e-06, "loss": 0.0005, "reward": 1.5704209804534912, "reward_std": 0.3697139024734497, "rewards/accuracy_reward": 0.5704209804534912, "rewards/format_reward": 1.0, "step": 17 }, { "all_correct": 0.15625, "all_wrong": 0.125, "completion_length": 88.5234375, "epoch": 0.034026465028355386, "grad_norm": 2.8917838645260643, "kl": 0.0155029296875, "learning_rate": 1.9942919230596897e-06, "loss": 0.0006, "reward": 1.5118800401687622, "reward_std": 0.3223443031311035, "rewards/accuracy_reward": 0.5196925401687622, "rewards/format_reward": 0.9921875, "step": 18 }, { "all_correct": 0.0625, "all_wrong": 0.09375, "completion_length": 84.2421875, "epoch": 0.035916824196597356, "grad_norm": 5.108847500395768, "kl": 0.0186767578125, "learning_rate": 1.9936407664781867e-06, "loss": 0.0007, "reward": 1.5046335458755493, "reward_std": 0.3630630373954773, "rewards/accuracy_reward": 0.5046335458755493, "rewards/format_reward": 1.0, "step": 19 }, { "all_correct": 0.125, "all_wrong": 0.09375, "completion_length": 84.87890625, "epoch": 0.03780718336483932, "grad_norm": 2.1872918469896065, "kl": 0.0181884765625, "learning_rate": 1.992954565666356e-06, "loss": 0.0007, "reward": 1.5272603034973145, "reward_std": 0.32879531383514404, "rewards/accuracy_reward": 0.5350728034973145, "rewards/format_reward": 0.9921875, "step": 20 }, { "all_correct": 0.09375, "all_wrong": 0.0625, "completion_length": 79.5859375, "epoch": 0.03969754253308128, "grad_norm": 3.3985079753937026, "kl": 0.0186767578125, "learning_rate": 1.9922333448254785e-06, "loss": 0.0007, "reward": 1.4413049221038818, "reward_std": 0.3449176847934723, "rewards/accuracy_reward": 0.44130486249923706, "rewards/format_reward": 1.0, "step": 21 }, { "all_correct": 0.09375, "all_wrong": 0.0625, "completion_length": 87.44921875, "epoch": 0.04158790170132325, "grad_norm": 2.1124500662245302, "kl": 0.0181884765625, "learning_rate": 1.9914771293919394e-06, "loss": 0.0007, "reward": 1.5039958953857422, "reward_std": 0.36595243215560913, "rewards/accuracy_reward": 0.5039960145950317, "rewards/format_reward": 1.0, "step": 22 }, { "all_correct": 0.125, "all_wrong": 0.1875, "completion_length": 90.96484375, "epoch": 0.043478260869565216, "grad_norm": 2.57988777327046, "kl": 0.020263671875, "learning_rate": 1.9906859460363304e-06, "loss": 0.0008, "reward": 1.406567931175232, "reward_std": 0.32273876667022705, "rewards/accuracy_reward": 0.41828668117523193, "rewards/format_reward": 0.98828125, "step": 23 }, { "all_correct": 0.125, "all_wrong": 0.0625, "completion_length": 81.42578125, "epoch": 0.045368620037807186, "grad_norm": 2.877431959109021, "kl": 0.0211181640625, "learning_rate": 1.9898598226625114e-06, "loss": 0.0008, "reward": 1.5652744770050049, "reward_std": 0.33840304613113403, "rewards/accuracy_reward": 0.5652744770050049, "rewards/format_reward": 1.0, "step": 24 }, { "all_correct": 0.0, "all_wrong": 0.0625, "completion_length": 82.98046875, "epoch": 0.04725897920604915, "grad_norm": 2.669500830918649, "kl": 0.01953125, "learning_rate": 1.9889987884066234e-06, "loss": 0.0008, "reward": 1.457775354385376, "reward_std": 0.3964886963367462, "rewards/accuracy_reward": 0.4577752947807312, "rewards/format_reward": 1.0, "step": 25 }, { "all_correct": 0.03125, "all_wrong": 0.125, "completion_length": 90.69921875, "epoch": 0.04914933837429111, "grad_norm": 2.1403683676993577, "kl": 0.01806640625, "learning_rate": 1.9881028736360623e-06, "loss": 0.0007, "reward": 1.3968093395233154, "reward_std": 0.3421282172203064, "rewards/accuracy_reward": 0.3968093991279602, "rewards/format_reward": 1.0, "step": 26 }, { "all_correct": 0.0625, "all_wrong": 0.0625, "completion_length": 86.796875, "epoch": 0.05103969754253308, "grad_norm": 2.435390719204286, "kl": 0.0238037109375, "learning_rate": 1.9871721099484077e-06, "loss": 0.001, "reward": 1.4553546905517578, "reward_std": 0.3946911692619324, "rewards/accuracy_reward": 0.4553546607494354, "rewards/format_reward": 1.0, "step": 27 }, { "all_correct": 0.03125, "all_wrong": 0.09375, "completion_length": 89.609375, "epoch": 0.052930056710775046, "grad_norm": 2.8058570294434158, "kl": 0.0213623046875, "learning_rate": 1.98620653017031e-06, "loss": 0.0009, "reward": 1.4355332851409912, "reward_std": 0.3729744553565979, "rewards/accuracy_reward": 0.4394395351409912, "rewards/format_reward": 0.99609375, "step": 28 }, { "all_correct": 0.09375, "all_wrong": 0.0625, "completion_length": 89.26953125, "epoch": 0.054820415879017016, "grad_norm": 2.5095592118103562, "kl": 0.0234375, "learning_rate": 1.9852061683563294e-06, "loss": 0.0009, "reward": 1.4201900959014893, "reward_std": 0.3762331008911133, "rewards/accuracy_reward": 0.4280025064945221, "rewards/format_reward": 0.9921875, "step": 29 }, { "all_correct": 0.0, "all_wrong": 0.125, "completion_length": 89.44921875, "epoch": 0.05671077504725898, "grad_norm": 2.251874852478182, "kl": 0.02587890625, "learning_rate": 1.9841710597877382e-06, "loss": 0.001, "reward": 1.482748031616211, "reward_std": 0.37411996722221375, "rewards/accuracy_reward": 0.4827480912208557, "rewards/format_reward": 1.0, "step": 30 }, { "all_correct": 0.09375, "all_wrong": 0.125, "completion_length": 93.671875, "epoch": 0.05860113421550094, "grad_norm": 3.0094971920246465, "kl": 0.0240478515625, "learning_rate": 1.9831012409712736e-06, "loss": 0.001, "reward": 1.5294721126556396, "reward_std": 0.28737539052963257, "rewards/accuracy_reward": 0.5294721126556396, "rewards/format_reward": 1.0, "step": 31 }, { "all_correct": 0.125, "all_wrong": 0.0625, "completion_length": 89.1875, "epoch": 0.06049149338374291, "grad_norm": 2.085921014940215, "kl": 0.02294921875, "learning_rate": 1.981996749637853e-06, "loss": 0.0009, "reward": 1.4983799457550049, "reward_std": 0.3365086317062378, "rewards/accuracy_reward": 0.5100986957550049, "rewards/format_reward": 0.98828125, "step": 32 }, { "all_correct": 0.0625, "all_wrong": 0.09375, "completion_length": 90.39453125, "epoch": 0.062381852551984876, "grad_norm": 2.622758297904014, "kl": 0.0267333984375, "learning_rate": 1.9808576247412406e-06, "loss": 0.0011, "reward": 1.457702875137329, "reward_std": 0.397009938955307, "rewards/accuracy_reward": 0.4577029049396515, "rewards/format_reward": 1.0, "step": 33 }, { "all_correct": 0.09375, "all_wrong": 0.09375, "completion_length": 89.71875, "epoch": 0.06427221172022685, "grad_norm": 2.171724077831634, "kl": 0.0255126953125, "learning_rate": 1.979683906456676e-06, "loss": 0.001, "reward": 1.5610275268554688, "reward_std": 0.3770396411418915, "rewards/accuracy_reward": 0.5688400268554688, "rewards/format_reward": 0.9921875, "step": 34 }, { "all_correct": 0.03125, "all_wrong": 0.125, "completion_length": 102.4375, "epoch": 0.0661625708884688, "grad_norm": 2.4062787127917553, "kl": 0.0238037109375, "learning_rate": 1.9784756361794553e-06, "loss": 0.001, "reward": 1.4750714302062988, "reward_std": 0.39490264654159546, "rewards/accuracy_reward": 0.49069640040397644, "rewards/format_reward": 0.984375, "step": 35 }, { "all_correct": 0.0625, "all_wrong": 0.09375, "completion_length": 100.83203125, "epoch": 0.06805293005671077, "grad_norm": 1.923377019543097, "kl": 0.0269775390625, "learning_rate": 1.9772328565234715e-06, "loss": 0.0011, "reward": 1.453148603439331, "reward_std": 0.3995356857776642, "rewards/accuracy_reward": 0.46877366304397583, "rewards/format_reward": 0.984375, "step": 36 }, { "all_correct": 0.125, "all_wrong": 0.0625, "completion_length": 93.76953125, "epoch": 0.06994328922495274, "grad_norm": 2.5852022147016065, "kl": 0.033935546875, "learning_rate": 1.9759556113197133e-06, "loss": 0.0014, "reward": 1.5378497838974, "reward_std": 0.3647596836090088, "rewards/accuracy_reward": 0.5378497838973999, "rewards/format_reward": 1.0, "step": 37 }, { "all_correct": 0.09375, "all_wrong": 0.03125, "completion_length": 93.65625, "epoch": 0.07183364839319471, "grad_norm": 3.0899596065894617, "kl": 0.0311279296875, "learning_rate": 1.974643945614717e-06, "loss": 0.0012, "reward": 1.5544549226760864, "reward_std": 0.38801953196525574, "rewards/accuracy_reward": 0.5661737322807312, "rewards/format_reward": 0.98828125, "step": 38 }, { "all_correct": 0.09375, "all_wrong": 0.15625, "completion_length": 87.58203125, "epoch": 0.07372400756143667, "grad_norm": 2.210050723375974, "kl": 0.031982421875, "learning_rate": 1.973297905668979e-06, "loss": 0.0013, "reward": 1.4015624523162842, "reward_std": 0.32554763555526733, "rewards/accuracy_reward": 0.40546876192092896, "rewards/format_reward": 0.99609375, "step": 39 }, { "all_correct": 0.0, "all_wrong": 0.25, "completion_length": 96.12109375, "epoch": 0.07561436672967864, "grad_norm": 3.5246311894519393, "kl": 0.029052734375, "learning_rate": 1.971917538955324e-06, "loss": 0.0012, "reward": 1.3508296012878418, "reward_std": 0.28737950325012207, "rewards/accuracy_reward": 0.350829541683197, "rewards/format_reward": 1.0, "step": 40 }, { "all_correct": 0.1875, "all_wrong": 0.0625, "completion_length": 89.08984375, "epoch": 0.07750472589792061, "grad_norm": 2.4498338590518633, "kl": 0.03466796875, "learning_rate": 1.9705028941572306e-06, "loss": 0.0014, "reward": 1.5412009954452515, "reward_std": 0.28274005651474, "rewards/accuracy_reward": 0.5412009358406067, "rewards/format_reward": 1.0, "step": 41 }, { "all_correct": 0.09375, "all_wrong": 0.09375, "completion_length": 92.94921875, "epoch": 0.07939508506616257, "grad_norm": 2.3904089625119402, "kl": 0.03369140625, "learning_rate": 1.9690540211671144e-06, "loss": 0.0013, "reward": 1.5241187810897827, "reward_std": 0.3563092350959778, "rewards/accuracy_reward": 0.5241187810897827, "rewards/format_reward": 1.0, "step": 42 }, { "all_correct": 0.09375, "all_wrong": 0.09375, "completion_length": 94.828125, "epoch": 0.08128544423440454, "grad_norm": 1.9707044514802314, "kl": 0.036865234375, "learning_rate": 1.9675709710845685e-06, "loss": 0.0015, "reward": 1.506849765777588, "reward_std": 0.37673529982566833, "rewards/accuracy_reward": 0.5263809561729431, "rewards/format_reward": 0.98046875, "step": 43 }, { "all_correct": 0.09375, "all_wrong": 0.125, "completion_length": 95.9140625, "epoch": 0.0831758034026465, "grad_norm": 2.2780949740065326, "kl": 0.037109375, "learning_rate": 1.966053796214561e-06, "loss": 0.0015, "reward": 1.53428316116333, "reward_std": 0.30121245980262756, "rewards/accuracy_reward": 0.5342831015586853, "rewards/format_reward": 1.0, "step": 44 }, { "all_correct": 0.09375, "all_wrong": 0.0, "completion_length": 93.00390625, "epoch": 0.08506616257088846, "grad_norm": 2.5387336771715483, "kl": 0.033935546875, "learning_rate": 1.9645025500655906e-06, "loss": 0.0014, "reward": 1.6001973152160645, "reward_std": 0.3812434673309326, "rewards/accuracy_reward": 0.6001973152160645, "rewards/format_reward": 1.0, "step": 45 }, { "all_correct": 0.15625, "all_wrong": 0.125, "completion_length": 95.98046875, "epoch": 0.08695652173913043, "grad_norm": 1.9377592683430596, "kl": 0.0341796875, "learning_rate": 1.9629172873477994e-06, "loss": 0.0014, "reward": 1.5135773420333862, "reward_std": 0.31737208366394043, "rewards/accuracy_reward": 0.5174835920333862, "rewards/format_reward": 0.99609375, "step": 46 }, { "all_correct": 0.03125, "all_wrong": 0.125, "completion_length": 100.23828125, "epoch": 0.0888468809073724, "grad_norm": 2.869116193500619, "kl": 0.036865234375, "learning_rate": 1.9612980639710424e-06, "loss": 0.0015, "reward": 1.4429457187652588, "reward_std": 0.3532693684101105, "rewards/accuracy_reward": 0.4468519687652588, "rewards/format_reward": 0.99609375, "step": 47 }, { "all_correct": 0.09375, "all_wrong": 0.1875, "completion_length": 92.6640625, "epoch": 0.09073724007561437, "grad_norm": 2.613636479936795, "kl": 0.0361328125, "learning_rate": 1.959644937042918e-06, "loss": 0.0014, "reward": 1.4310598373413086, "reward_std": 0.3048115074634552, "rewards/accuracy_reward": 0.43887221813201904, "rewards/format_reward": 0.9921875, "step": 48 }, { "all_correct": 0.0625, "all_wrong": 0.0625, "completion_length": 94.671875, "epoch": 0.09262759924385633, "grad_norm": 2.3402731827939296, "kl": 0.0361328125, "learning_rate": 1.957957964866751e-06, "loss": 0.0014, "reward": 1.512986660003662, "reward_std": 0.3828140199184418, "rewards/accuracy_reward": 0.5168927907943726, "rewards/format_reward": 0.99609375, "step": 49 }, { "all_correct": 0.09375, "all_wrong": 0.0625, "completion_length": 98.546875, "epoch": 0.0945179584120983, "grad_norm": 2.244807148397354, "kl": 0.03857421875, "learning_rate": 1.956237206939538e-06, "loss": 0.0015, "reward": 1.47536301612854, "reward_std": 0.400837242603302, "rewards/accuracy_reward": 0.47926920652389526, "rewards/format_reward": 0.99609375, "step": 50 }, { "all_correct": 0.125, "all_wrong": 0.09375, "completion_length": 90.328125, "epoch": 0.09640831758034027, "grad_norm": 2.3971385643088348, "kl": 0.04248046875, "learning_rate": 1.9544827239498494e-06, "loss": 0.0017, "reward": 1.5585781335830688, "reward_std": 0.37756532430648804, "rewards/accuracy_reward": 0.5624843835830688, "rewards/format_reward": 0.99609375, "step": 51 }, { "all_correct": 0.15625, "all_wrong": 0.09375, "completion_length": 84.60546875, "epoch": 0.09829867674858223, "grad_norm": 2.2709012051170987, "kl": 0.038330078125, "learning_rate": 1.952694577775688e-06, "loss": 0.0015, "reward": 1.4947199821472168, "reward_std": 0.28436505794525146, "rewards/accuracy_reward": 0.4947200417518616, "rewards/format_reward": 1.0, "step": 52 }, { "all_correct": 0.1875, "all_wrong": 0.15625, "completion_length": 90.6953125, "epoch": 0.1001890359168242, "grad_norm": 2.3353621567696172, "kl": 0.040283203125, "learning_rate": 1.950872831482306e-06, "loss": 0.0016, "reward": 1.5374271869659424, "reward_std": 0.2822425663471222, "rewards/accuracy_reward": 0.5374271869659424, "rewards/format_reward": 1.0, "step": 53 }, { "all_correct": 0.15625, "all_wrong": 0.0625, "completion_length": 85.8828125, "epoch": 0.10207939508506617, "grad_norm": 2.686467954946551, "kl": 0.044189453125, "learning_rate": 1.949017549319983e-06, "loss": 0.0018, "reward": 1.546825647354126, "reward_std": 0.2987156808376312, "rewards/accuracy_reward": 0.546825647354126, "rewards/format_reward": 1.0, "step": 54 }, { "all_correct": 0.09375, "all_wrong": 0.0625, "completion_length": 98.734375, "epoch": 0.10396975425330812, "grad_norm": 1.9504297926987135, "kl": 0.0390625, "learning_rate": 1.947128796721759e-06, "loss": 0.0016, "reward": 1.6038429737091064, "reward_std": 0.3754524886608124, "rewards/accuracy_reward": 0.6077491641044617, "rewards/format_reward": 0.99609375, "step": 55 }, { "all_correct": 0.125, "all_wrong": 0.09375, "completion_length": 93.5859375, "epoch": 0.10586011342155009, "grad_norm": 3.287003319255905, "kl": 0.041748046875, "learning_rate": 1.9452066403011253e-06, "loss": 0.0017, "reward": 1.4696145057678223, "reward_std": 0.34197893738746643, "rewards/accuracy_reward": 0.47352084517478943, "rewards/format_reward": 0.99609375, "step": 56 }, { "all_correct": 0.21875, "all_wrong": 0.09375, "completion_length": 94.36328125, "epoch": 0.10775047258979206, "grad_norm": 3.2939637852247685, "kl": 0.04345703125, "learning_rate": 1.9432511478496766e-06, "loss": 0.0017, "reward": 1.5146770477294922, "reward_std": 0.26444536447525024, "rewards/accuracy_reward": 0.514677107334137, "rewards/format_reward": 1.0, "step": 57 }, { "all_correct": 0.1875, "all_wrong": 0.15625, "completion_length": 92.765625, "epoch": 0.10964083175803403, "grad_norm": 2.1754630667582413, "kl": 0.046630859375, "learning_rate": 1.9412623883347206e-06, "loss": 0.0019, "reward": 1.5093607902526855, "reward_std": 0.26719433069229126, "rewards/accuracy_reward": 0.5171732902526855, "rewards/format_reward": 0.9921875, "step": 58 }, { "all_correct": 0.125, "all_wrong": 0.09375, "completion_length": 89.7890625, "epoch": 0.11153119092627599, "grad_norm": 3.158327945972579, "kl": 0.042236328125, "learning_rate": 1.939240431896844e-06, "loss": 0.0017, "reward": 1.4645804166793823, "reward_std": 0.333609402179718, "rewards/accuracy_reward": 0.46848660707473755, "rewards/format_reward": 0.99609375, "step": 59 }, { "all_correct": 0.125, "all_wrong": 0.1875, "completion_length": 92.24609375, "epoch": 0.11342155009451796, "grad_norm": 4.067057374654062, "kl": 0.04736328125, "learning_rate": 1.937185349847439e-06, "loss": 0.0019, "reward": 1.4154318571090698, "reward_std": 0.28358522057533264, "rewards/accuracy_reward": 0.41543182730674744, "rewards/format_reward": 1.0, "step": 60 }, { "all_correct": 0.1875, "all_wrong": 0.21875, "completion_length": 85.125, "epoch": 0.11531190926275993, "grad_norm": 10.445979366082359, "kl": 0.048095703125, "learning_rate": 1.9350972146661903e-06, "loss": 0.0019, "reward": 1.5046515464782715, "reward_std": 0.24698607623577118, "rewards/accuracy_reward": 0.5085577964782715, "rewards/format_reward": 0.99609375, "step": 61 }, { "all_correct": 0.15625, "all_wrong": 0.1875, "completion_length": 87.96484375, "epoch": 0.11720226843100189, "grad_norm": 4.779328158469133, "kl": 0.04736328125, "learning_rate": 1.9329760999985165e-06, "loss": 0.0019, "reward": 1.4373040199279785, "reward_std": 0.29254239797592163, "rewards/accuracy_reward": 0.4373040795326233, "rewards/format_reward": 1.0, "step": 62 }, { "all_correct": 0.1875, "all_wrong": 0.125, "completion_length": 85.94140625, "epoch": 0.11909262759924386, "grad_norm": 2.355263239902568, "kl": 0.052490234375, "learning_rate": 1.9308220806529737e-06, "loss": 0.0021, "reward": 1.4870660305023193, "reward_std": 0.3119330108165741, "rewards/accuracy_reward": 0.49097222089767456, "rewards/format_reward": 0.99609375, "step": 63 }, { "all_correct": 0.1875, "all_wrong": 0.09375, "completion_length": 82.74609375, "epoch": 0.12098298676748583, "grad_norm": 2.4735239418793604, "kl": 0.050048828125, "learning_rate": 1.9286352325986163e-06, "loss": 0.002, "reward": 1.5674299001693726, "reward_std": 0.2720338702201843, "rewards/accuracy_reward": 0.5791486501693726, "rewards/format_reward": 0.98828125, "step": 64 }, { "all_correct": 0.15625, "all_wrong": 0.09375, "completion_length": 84.96875, "epoch": 0.12287334593572778, "grad_norm": 3.2309139052967457, "kl": 0.05517578125, "learning_rate": 1.9264156329623195e-06, "loss": 0.0022, "reward": 1.5520949363708496, "reward_std": 0.2912652790546417, "rewards/accuracy_reward": 0.5520949363708496, "rewards/format_reward": 1.0, "step": 65 }, { "all_correct": 0.21875, "all_wrong": 0.125, "completion_length": 86.56640625, "epoch": 0.12476370510396975, "grad_norm": 3.426837800551729, "kl": 0.05126953125, "learning_rate": 1.9241633600260575e-06, "loss": 0.0021, "reward": 1.5658715963363647, "reward_std": 0.2718903720378876, "rewards/accuracy_reward": 0.5658715963363647, "rewards/format_reward": 1.0, "step": 66 }, { "all_correct": 0.15625, "all_wrong": 0.125, "completion_length": 85.89453125, "epoch": 0.1266540642722117, "grad_norm": 5.040208361814542, "kl": 0.046630859375, "learning_rate": 1.921878493224143e-06, "loss": 0.0019, "reward": 1.4924273490905762, "reward_std": 0.2940727174282074, "rewards/accuracy_reward": 0.49633359909057617, "rewards/format_reward": 0.99609375, "step": 67 }, { "all_correct": 0.3125, "all_wrong": 0.125, "completion_length": 78.671875, "epoch": 0.1285444234404537, "grad_norm": 2.2476945314714643, "kl": 0.050048828125, "learning_rate": 1.9195611131404267e-06, "loss": 0.002, "reward": 1.5983318090438843, "reward_std": 0.21439965069293976, "rewards/accuracy_reward": 0.5983318090438843, "rewards/format_reward": 1.0, "step": 68 }, { "all_correct": 0.15625, "all_wrong": 0.25, "completion_length": 87.28515625, "epoch": 0.13043478260869565, "grad_norm": 1.8603814885894778, "kl": 0.043701171875, "learning_rate": 1.9172113015054528e-06, "loss": 0.0017, "reward": 1.3991045951843262, "reward_std": 0.22454139590263367, "rewards/accuracy_reward": 0.4030107259750366, "rewards/format_reward": 0.99609375, "step": 69 }, { "all_correct": 0.1875, "all_wrong": 0.09375, "completion_length": 85.96484375, "epoch": 0.1323251417769376, "grad_norm": 3.4308816795426536, "kl": 0.0537109375, "learning_rate": 1.9148291411935796e-06, "loss": 0.0022, "reward": 1.5655429363250732, "reward_std": 0.3076818287372589, "rewards/accuracy_reward": 0.5655430555343628, "rewards/format_reward": 1.0, "step": 70 }, { "all_correct": 0.15625, "all_wrong": 0.09375, "completion_length": 91.11328125, "epoch": 0.1342155009451796, "grad_norm": 1.959995440169171, "kl": 0.0517578125, "learning_rate": 1.9124147162200534e-06, "loss": 0.0021, "reward": 1.4995213747024536, "reward_std": 0.3407973051071167, "rewards/accuracy_reward": 0.5073338747024536, "rewards/format_reward": 0.9921875, "step": 71 }, { "all_correct": 0.3125, "all_wrong": 0.15625, "completion_length": 79.31640625, "epoch": 0.13610586011342155, "grad_norm": 1.8897107714275212, "kl": 0.053466796875, "learning_rate": 1.9099681117380486e-06, "loss": 0.0021, "reward": 1.634920597076416, "reward_std": 0.21926391124725342, "rewards/accuracy_reward": 0.6349206566810608, "rewards/format_reward": 1.0, "step": 72 }, { "all_correct": 0.4375, "all_wrong": 0.09375, "completion_length": 83.95703125, "epoch": 0.13799621928166353, "grad_norm": 1.6792021771564014, "kl": 0.052490234375, "learning_rate": 1.907489414035662e-06, "loss": 0.0021, "reward": 1.7418066263198853, "reward_std": 0.20898818969726562, "rewards/accuracy_reward": 0.7457128763198853, "rewards/format_reward": 0.99609375, "step": 73 }, { "all_correct": 0.21875, "all_wrong": 0.25, "completion_length": 82.68359375, "epoch": 0.13988657844990549, "grad_norm": 18.661291363199766, "kl": 0.0556640625, "learning_rate": 1.9049787105328714e-06, "loss": 0.0022, "reward": 1.5272233486175537, "reward_std": 0.22412577271461487, "rewards/accuracy_reward": 0.5311296582221985, "rewards/format_reward": 0.99609375, "step": 74 }, { "all_correct": 0.21875, "all_wrong": 0.125, "completion_length": 86.87890625, "epoch": 0.14177693761814744, "grad_norm": 1.968107409229428, "kl": 0.05517578125, "learning_rate": 1.9024360897784505e-06, "loss": 0.0022, "reward": 1.53652024269104, "reward_std": 0.2906913161277771, "rewards/accuracy_reward": 0.5443326234817505, "rewards/format_reward": 0.9921875, "step": 75 }, { "all_correct": 0.25, "all_wrong": 0.15625, "completion_length": 89.5546875, "epoch": 0.14366729678638943, "grad_norm": 1.9463441104137431, "kl": 0.05322265625, "learning_rate": 1.8998616414468477e-06, "loss": 0.0021, "reward": 1.5365304946899414, "reward_std": 0.22610870003700256, "rewards/accuracy_reward": 0.5365304350852966, "rewards/format_reward": 1.0, "step": 76 }, { "all_correct": 0.1875, "all_wrong": 0.1875, "completion_length": 89.98046875, "epoch": 0.14555765595463138, "grad_norm": 4.159641747444261, "kl": 0.0517578125, "learning_rate": 1.897255456335022e-06, "loss": 0.0021, "reward": 1.530354619026184, "reward_std": 0.2696187496185303, "rewards/accuracy_reward": 0.5303546190261841, "rewards/format_reward": 1.0, "step": 77 }, { "all_correct": 0.28125, "all_wrong": 0.125, "completion_length": 86.078125, "epoch": 0.14744801512287334, "grad_norm": 1.8829618370755878, "kl": 0.054443359375, "learning_rate": 1.894617626359242e-06, "loss": 0.0022, "reward": 1.6262216567993164, "reward_std": 0.2085573971271515, "rewards/accuracy_reward": 0.6262217164039612, "rewards/format_reward": 1.0, "step": 78 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 92.19921875, "epoch": 0.14933837429111532, "grad_norm": 3.1281418285199996, "kl": 0.047607421875, "learning_rate": 1.8919482445518434e-06, "loss": 0.0019, "reward": 1.5566154718399048, "reward_std": 0.2710142731666565, "rewards/accuracy_reward": 0.5566154718399048, "rewards/format_reward": 1.0, "step": 79 }, { "all_correct": 0.3125, "all_wrong": 0.1875, "completion_length": 97.9453125, "epoch": 0.15122873345935728, "grad_norm": 1.8453083650465387, "kl": 0.0498046875, "learning_rate": 1.8892474050579476e-06, "loss": 0.002, "reward": 1.526172399520874, "reward_std": 0.15693798661231995, "rewards/accuracy_reward": 0.5261724591255188, "rewards/format_reward": 1.0, "step": 80 }, { "all_correct": 0.34375, "all_wrong": 0.1875, "completion_length": 96.15625, "epoch": 0.15311909262759923, "grad_norm": 1.618450951037405, "kl": 0.052734375, "learning_rate": 1.8865152031321425e-06, "loss": 0.0021, "reward": 1.5617804527282715, "reward_std": 0.20894506573677063, "rewards/accuracy_reward": 0.5656867027282715, "rewards/format_reward": 0.99609375, "step": 81 }, { "all_correct": 0.25, "all_wrong": 0.21875, "completion_length": 93.93359375, "epoch": 0.15500945179584122, "grad_norm": 3.3619693904490346, "kl": 0.049560546875, "learning_rate": 1.8837517351351212e-06, "loss": 0.002, "reward": 1.502871036529541, "reward_std": 0.2329874485731125, "rewards/accuracy_reward": 0.5028710961341858, "rewards/format_reward": 1.0, "step": 82 }, { "all_correct": 0.25, "all_wrong": 0.09375, "completion_length": 100.8671875, "epoch": 0.15689981096408318, "grad_norm": 2.0482069336725717, "kl": 0.05029296875, "learning_rate": 1.8809570985302861e-06, "loss": 0.002, "reward": 1.5919384956359863, "reward_std": 0.2818424105644226, "rewards/accuracy_reward": 0.5919384956359863, "rewards/format_reward": 1.0, "step": 83 }, { "all_correct": 0.3125, "all_wrong": 0.1875, "completion_length": 93.40234375, "epoch": 0.15879017013232513, "grad_norm": 1.5003724294387097, "kl": 0.0498046875, "learning_rate": 1.8781313918803083e-06, "loss": 0.002, "reward": 1.5504703521728516, "reward_std": 0.22670012712478638, "rewards/accuracy_reward": 0.5739079117774963, "rewards/format_reward": 0.9765625, "step": 84 }, { "all_correct": 0.375, "all_wrong": 0.1875, "completion_length": 87.7421875, "epoch": 0.16068052930056712, "grad_norm": 4.602359933501162, "kl": 0.054443359375, "learning_rate": 1.8752747148436542e-06, "loss": 0.0022, "reward": 1.5955908298492432, "reward_std": 0.164137065410614, "rewards/accuracy_reward": 0.5955909490585327, "rewards/format_reward": 1.0, "step": 85 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 92.83203125, "epoch": 0.16257088846880907, "grad_norm": 1.903155543992341, "kl": 0.052490234375, "learning_rate": 1.8723871681710694e-06, "loss": 0.0021, "reward": 1.4634450674057007, "reward_std": 0.20022635161876678, "rewards/accuracy_reward": 0.4634450674057007, "rewards/format_reward": 1.0, "step": 86 }, { "all_correct": 0.28125, "all_wrong": 0.25, "completion_length": 91.84765625, "epoch": 0.16446124763705103, "grad_norm": 1.6427736050864759, "kl": 0.06640625, "learning_rate": 1.8694688537020258e-06, "loss": 0.0027, "reward": 1.4603149890899658, "reward_std": 0.1893424689769745, "rewards/accuracy_reward": 0.468127578496933, "rewards/format_reward": 0.9921875, "step": 87 }, { "all_correct": 0.3125, "all_wrong": 0.28125, "completion_length": 98.12890625, "epoch": 0.166351606805293, "grad_norm": 1.2849844281526226, "kl": 0.053955078125, "learning_rate": 1.866519874361129e-06, "loss": 0.0022, "reward": 1.493840217590332, "reward_std": 0.1990819126367569, "rewards/accuracy_reward": 0.5055589079856873, "rewards/format_reward": 0.98828125, "step": 88 }, { "all_correct": 0.28125, "all_wrong": 0.03125, "completion_length": 97.5546875, "epoch": 0.16824196597353497, "grad_norm": 3.36212198927166, "kl": 0.051513671875, "learning_rate": 1.8635403341544897e-06, "loss": 0.0021, "reward": 1.6382396221160889, "reward_std": 0.29065367579460144, "rewards/accuracy_reward": 0.6421457529067993, "rewards/format_reward": 0.99609375, "step": 89 }, { "all_correct": 0.25, "all_wrong": 0.21875, "completion_length": 93.35546875, "epoch": 0.17013232514177692, "grad_norm": 1.964853567576083, "kl": 0.052001953125, "learning_rate": 1.8605303381660542e-06, "loss": 0.0021, "reward": 1.4816901683807373, "reward_std": 0.2411368191242218, "rewards/accuracy_reward": 0.48559650778770447, "rewards/format_reward": 0.99609375, "step": 90 }, { "all_correct": 0.34375, "all_wrong": 0.15625, "completion_length": 97.84375, "epoch": 0.1720226843100189, "grad_norm": 1.7427774741536497, "kl": 0.048583984375, "learning_rate": 1.8574899925538995e-06, "loss": 0.0019, "reward": 1.594164490699768, "reward_std": 0.22482213377952576, "rewards/accuracy_reward": 0.6176020503044128, "rewards/format_reward": 0.9765625, "step": 91 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 96.33203125, "epoch": 0.17391304347826086, "grad_norm": 1.178350633076914, "kl": 0.0478515625, "learning_rate": 1.8544194045464886e-06, "loss": 0.0019, "reward": 1.570425271987915, "reward_std": 0.15437397360801697, "rewards/accuracy_reward": 0.5704251527786255, "rewards/format_reward": 1.0, "step": 92 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 96.08984375, "epoch": 0.17580340264650285, "grad_norm": 1.961774060919074, "kl": 0.050048828125, "learning_rate": 1.8513186824388878e-06, "loss": 0.002, "reward": 1.4320415258407593, "reward_std": 0.18912720680236816, "rewards/accuracy_reward": 0.44766655564308167, "rewards/format_reward": 0.984375, "step": 93 }, { "all_correct": 0.3125, "all_wrong": 0.25, "completion_length": 99.69921875, "epoch": 0.1776937618147448, "grad_norm": 1.5015570744898201, "kl": 0.046630859375, "learning_rate": 1.8481879355889493e-06, "loss": 0.0019, "reward": 1.5381855964660645, "reward_std": 0.2091609686613083, "rewards/accuracy_reward": 0.5459980964660645, "rewards/format_reward": 0.9921875, "step": 94 }, { "all_correct": 0.25, "all_wrong": 0.34375, "completion_length": 93.59765625, "epoch": 0.17958412098298676, "grad_norm": 1.5238358848316345, "kl": 0.0576171875, "learning_rate": 1.8450272744134533e-06, "loss": 0.0023, "reward": 1.4812531471252441, "reward_std": 0.15344488620758057, "rewards/accuracy_reward": 0.48125314712524414, "rewards/format_reward": 1.0, "step": 95 }, { "all_correct": 0.21875, "all_wrong": 0.3125, "completion_length": 101.0390625, "epoch": 0.18147448015122875, "grad_norm": 1.3836535628215803, "kl": 0.053955078125, "learning_rate": 1.8418368103842122e-06, "loss": 0.0022, "reward": 1.4727139472961426, "reward_std": 0.20200154185295105, "rewards/accuracy_reward": 0.4766201078891754, "rewards/format_reward": 0.99609375, "step": 96 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 100.37109375, "epoch": 0.1833648393194707, "grad_norm": 4.036599697785966, "kl": 0.04541015625, "learning_rate": 1.8386166560241431e-06, "loss": 0.0018, "reward": 1.5775189399719238, "reward_std": 0.2264866977930069, "rewards/accuracy_reward": 0.5775189399719238, "rewards/format_reward": 1.0, "step": 97 }, { "all_correct": 0.3125, "all_wrong": 0.15625, "completion_length": 91.52734375, "epoch": 0.18525519848771266, "grad_norm": 2.004043176233875, "kl": 0.048095703125, "learning_rate": 1.835366924903295e-06, "loss": 0.0019, "reward": 1.612939476966858, "reward_std": 0.20541028678417206, "rewards/accuracy_reward": 0.6129394769668579, "rewards/format_reward": 1.0, "step": 98 }, { "all_correct": 0.46875, "all_wrong": 0.1875, "completion_length": 86.5625, "epoch": 0.18714555765595464, "grad_norm": 1.4852739381285651, "kl": 0.04736328125, "learning_rate": 1.8320877316348453e-06, "loss": 0.0019, "reward": 1.6532118320465088, "reward_std": 0.14846470952033997, "rewards/accuracy_reward": 0.6532118320465088, "rewards/format_reward": 1.0, "step": 99 }, { "all_correct": 0.15625, "all_wrong": 0.125, "completion_length": 99.05078125, "epoch": 0.1890359168241966, "grad_norm": 4.076356880937712, "kl": 0.043701171875, "learning_rate": 1.8287791918710584e-06, "loss": 0.0017, "reward": 1.478124976158142, "reward_std": 0.32335546612739563, "rewards/accuracy_reward": 0.4859375059604645, "rewards/format_reward": 0.9921875, "step": 100 }, { "all_correct": 0.28125, "all_wrong": 0.03125, "completion_length": 91.46875, "epoch": 0.19092627599243855, "grad_norm": 3.6383673386285675, "kl": 0.04296875, "learning_rate": 1.8254414222992057e-06, "loss": 0.0017, "reward": 1.6602026224136353, "reward_std": 0.30003082752227783, "rewards/accuracy_reward": 0.6602025628089905, "rewards/format_reward": 1.0, "step": 101 }, { "all_correct": 0.25, "all_wrong": 0.0625, "completion_length": 90.078125, "epoch": 0.19281663516068054, "grad_norm": 2.071351404946499, "kl": 0.04833984375, "learning_rate": 1.8220745406374495e-06, "loss": 0.0019, "reward": 1.6283621788024902, "reward_std": 0.2751215100288391, "rewards/accuracy_reward": 0.6283620595932007, "rewards/format_reward": 1.0, "step": 102 }, { "all_correct": 0.21875, "all_wrong": 0.1875, "completion_length": 97.75390625, "epoch": 0.1947069943289225, "grad_norm": 1.97407172472788, "kl": 0.044921875, "learning_rate": 1.8186786656306934e-06, "loss": 0.0018, "reward": 1.5005707740783691, "reward_std": 0.2729160189628601, "rewards/accuracy_reward": 0.5083833336830139, "rewards/format_reward": 0.9921875, "step": 103 }, { "all_correct": 0.21875, "all_wrong": 0.125, "completion_length": 90.484375, "epoch": 0.19659735349716445, "grad_norm": 2.1911504081514734, "kl": 0.04345703125, "learning_rate": 1.8152539170463922e-06, "loss": 0.0017, "reward": 1.5098209381103516, "reward_std": 0.24280044436454773, "rewards/accuracy_reward": 0.5137272477149963, "rewards/format_reward": 0.99609375, "step": 104 }, { "all_correct": 0.375, "all_wrong": 0.0625, "completion_length": 91.6875, "epoch": 0.19848771266540643, "grad_norm": 2.1758066744800924, "kl": 0.048583984375, "learning_rate": 1.8118004156703295e-06, "loss": 0.0019, "reward": 1.666813611984253, "reward_std": 0.2204323410987854, "rewards/accuracy_reward": 0.6668134927749634, "rewards/format_reward": 1.0, "step": 105 }, { "all_correct": 0.3125, "all_wrong": 0.15625, "completion_length": 85.43359375, "epoch": 0.2003780718336484, "grad_norm": 2.2920515987360774, "kl": 0.048828125, "learning_rate": 1.808318283302356e-06, "loss": 0.002, "reward": 1.6315945386886597, "reward_std": 0.17060711979866028, "rewards/accuracy_reward": 0.6315945386886597, "rewards/format_reward": 1.0, "step": 106 }, { "all_correct": 0.40625, "all_wrong": 0.09375, "completion_length": 92.27734375, "epoch": 0.20226843100189035, "grad_norm": 1.6468108640255814, "kl": 0.048583984375, "learning_rate": 1.8048076427520956e-06, "loss": 0.0019, "reward": 1.6242039203643799, "reward_std": 0.2274279147386551, "rewards/accuracy_reward": 0.6281101703643799, "rewards/format_reward": 0.99609375, "step": 107 }, { "all_correct": 0.1875, "all_wrong": 0.34375, "completion_length": 90.59765625, "epoch": 0.20415879017013233, "grad_norm": 1.66499120414633, "kl": 0.052978515625, "learning_rate": 1.801268617834614e-06, "loss": 0.0021, "reward": 1.4206892251968384, "reward_std": 0.15304508805274963, "rewards/accuracy_reward": 0.4206892251968384, "rewards/format_reward": 1.0, "step": 108 }, { "all_correct": 0.4375, "all_wrong": 0.125, "completion_length": 92.9296875, "epoch": 0.2060491493383743, "grad_norm": 1.3046512489220061, "kl": 0.044677734375, "learning_rate": 1.7977013333660498e-06, "loss": 0.0018, "reward": 1.637601613998413, "reward_std": 0.17598497867584229, "rewards/accuracy_reward": 0.6571328639984131, "rewards/format_reward": 0.98046875, "step": 109 }, { "all_correct": 0.34375, "all_wrong": 0.21875, "completion_length": 87.953125, "epoch": 0.20793950850661624, "grad_norm": 1.9099290554016326, "kl": 0.04541015625, "learning_rate": 1.7941059151592145e-06, "loss": 0.0018, "reward": 1.579087734222412, "reward_std": 0.17340317368507385, "rewards/accuracy_reward": 0.5829939842224121, "rewards/format_reward": 0.99609375, "step": 110 }, { "all_correct": 0.40625, "all_wrong": 0.21875, "completion_length": 93.56640625, "epoch": 0.20982986767485823, "grad_norm": 6.034052106072661, "kl": 0.046142578125, "learning_rate": 1.7904824900191555e-06, "loss": 0.0018, "reward": 1.5380107164382935, "reward_std": 0.1255079060792923, "rewards/accuracy_reward": 0.5380107164382935, "rewards/format_reward": 1.0, "step": 111 }, { "all_correct": 0.34375, "all_wrong": 0.25, "completion_length": 92.42578125, "epoch": 0.21172022684310018, "grad_norm": 1.4881947802094542, "kl": 0.050537109375, "learning_rate": 1.786831185738682e-06, "loss": 0.002, "reward": 1.5942175388336182, "reward_std": 0.12322144210338593, "rewards/accuracy_reward": 0.5981237888336182, "rewards/format_reward": 0.99609375, "step": 112 }, { "all_correct": 0.28125, "all_wrong": 0.1875, "completion_length": 89.22265625, "epoch": 0.21361058601134217, "grad_norm": 3.5770227394508876, "kl": 0.04931640625, "learning_rate": 1.7831521310938587e-06, "loss": 0.002, "reward": 1.5163066387176514, "reward_std": 0.1974867880344391, "rewards/accuracy_reward": 0.5163066387176514, "rewards/format_reward": 1.0, "step": 113 }, { "all_correct": 0.40625, "all_wrong": 0.09375, "completion_length": 82.1328125, "epoch": 0.21550094517958412, "grad_norm": 2.212730582665482, "kl": 0.0556640625, "learning_rate": 1.7794454558394657e-06, "loss": 0.0022, "reward": 1.6943297386169434, "reward_std": 0.18354207277297974, "rewards/accuracy_reward": 0.6943297982215881, "rewards/format_reward": 1.0, "step": 114 }, { "all_correct": 0.25, "all_wrong": 0.21875, "completion_length": 90.8203125, "epoch": 0.21739130434782608, "grad_norm": 4.361936414639028, "kl": 0.056396484375, "learning_rate": 1.7757112907044198e-06, "loss": 0.0023, "reward": 1.555484652519226, "reward_std": 0.1996951699256897, "rewards/accuracy_reward": 0.5554846525192261, "rewards/format_reward": 1.0, "step": 115 }, { "all_correct": 0.3125, "all_wrong": 0.25, "completion_length": 87.26171875, "epoch": 0.21928166351606806, "grad_norm": 1.8180751664027779, "kl": 0.0498046875, "learning_rate": 1.7719497673871651e-06, "loss": 0.002, "reward": 1.4978692531585693, "reward_std": 0.19965368509292603, "rewards/accuracy_reward": 0.4978693127632141, "rewards/format_reward": 1.0, "step": 116 }, { "all_correct": 0.4375, "all_wrong": 0.15625, "completion_length": 93.81640625, "epoch": 0.22117202268431002, "grad_norm": 1.8114429952435827, "kl": 0.047119140625, "learning_rate": 1.7681610185510283e-06, "loss": 0.0019, "reward": 1.6657145023345947, "reward_std": 0.15193983912467957, "rewards/accuracy_reward": 0.66962069272995, "rewards/format_reward": 0.99609375, "step": 117 }, { "all_correct": 0.3125, "all_wrong": 0.1875, "completion_length": 88.76171875, "epoch": 0.22306238185255198, "grad_norm": 3.054644149610559, "kl": 0.049072265625, "learning_rate": 1.7643451778195394e-06, "loss": 0.002, "reward": 1.5918383598327637, "reward_std": 0.20005470514297485, "rewards/accuracy_reward": 0.5918383002281189, "rewards/format_reward": 1.0, "step": 118 }, { "all_correct": 0.40625, "all_wrong": 0.125, "completion_length": 86.08203125, "epoch": 0.22495274102079396, "grad_norm": 2.092558777656238, "kl": 0.054443359375, "learning_rate": 1.7605023797717194e-06, "loss": 0.0022, "reward": 1.6277587413787842, "reward_std": 0.19753384590148926, "rewards/accuracy_reward": 0.6277587413787842, "rewards/format_reward": 1.0, "step": 119 }, { "all_correct": 0.34375, "all_wrong": 0.15625, "completion_length": 90.5859375, "epoch": 0.22684310018903592, "grad_norm": 4.221955954568503, "kl": 0.051025390625, "learning_rate": 1.7566327599373336e-06, "loss": 0.002, "reward": 1.6072583198547363, "reward_std": 0.1987745761871338, "rewards/accuracy_reward": 0.6072583794593811, "rewards/format_reward": 1.0, "step": 120 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 90.83984375, "epoch": 0.22873345935727787, "grad_norm": 1.8343630634483405, "kl": 0.0498046875, "learning_rate": 1.7527364547921118e-06, "loss": 0.002, "reward": 1.6175568103790283, "reward_std": 0.17853425443172455, "rewards/accuracy_reward": 0.6175566911697388, "rewards/format_reward": 1.0, "step": 121 }, { "all_correct": 0.46875, "all_wrong": 0.125, "completion_length": 84.7734375, "epoch": 0.23062381852551986, "grad_norm": 1.3989906719734049, "kl": 0.0615234375, "learning_rate": 1.748813601752935e-06, "loss": 0.0025, "reward": 1.7062667608261108, "reward_std": 0.1319604218006134, "rewards/accuracy_reward": 0.7062667012214661, "rewards/format_reward": 1.0, "step": 122 }, { "all_correct": 0.34375, "all_wrong": 0.15625, "completion_length": 91.40234375, "epoch": 0.23251417769376181, "grad_norm": 1.9445947479760342, "kl": 0.052978515625, "learning_rate": 1.7448643391729886e-06, "loss": 0.0021, "reward": 1.573242425918579, "reward_std": 0.2199619710445404, "rewards/accuracy_reward": 0.5849611759185791, "rewards/format_reward": 0.98828125, "step": 123 }, { "all_correct": 0.28125, "all_wrong": 0.25, "completion_length": 92.703125, "epoch": 0.23440453686200377, "grad_norm": 1.9821554924076896, "kl": 0.047607421875, "learning_rate": 1.7408888063368838e-06, "loss": 0.0019, "reward": 1.517979621887207, "reward_std": 0.1772761046886444, "rewards/accuracy_reward": 0.517979621887207, "rewards/format_reward": 1.0, "step": 124 }, { "all_correct": 0.25, "all_wrong": 0.1875, "completion_length": 94.2421875, "epoch": 0.23629489603024575, "grad_norm": 1.7860361568391379, "kl": 0.056640625, "learning_rate": 1.7368871434557445e-06, "loss": 0.0023, "reward": 1.5142911672592163, "reward_std": 0.24644066393375397, "rewards/accuracy_reward": 0.5221036672592163, "rewards/format_reward": 0.9921875, "step": 125 }, { "all_correct": 0.1875, "all_wrong": 0.0, "completion_length": 100.96484375, "epoch": 0.2381852551984877, "grad_norm": 2.329904142862184, "kl": 0.046875, "learning_rate": 1.7328594916622615e-06, "loss": 0.0019, "reward": 1.5455485582351685, "reward_std": 0.35480332374572754, "rewards/accuracy_reward": 0.5650798082351685, "rewards/format_reward": 0.98046875, "step": 126 }, { "all_correct": 0.3125, "all_wrong": 0.1875, "completion_length": 85.55859375, "epoch": 0.24007561436672967, "grad_norm": 1.3180884643239135, "kl": 0.045654296875, "learning_rate": 1.7288059930057165e-06, "loss": 0.0018, "reward": 1.626103401184082, "reward_std": 0.19042542576789856, "rewards/accuracy_reward": 0.633915901184082, "rewards/format_reward": 0.9921875, "step": 127 }, { "all_correct": 0.3125, "all_wrong": 0.15625, "completion_length": 96.4765625, "epoch": 0.24196597353497165, "grad_norm": 1.631849621742655, "kl": 0.05322265625, "learning_rate": 1.7247267904469723e-06, "loss": 0.0021, "reward": 1.5507967472076416, "reward_std": 0.21631184220314026, "rewards/accuracy_reward": 0.5586091876029968, "rewards/format_reward": 0.9921875, "step": 128 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 90.7265625, "epoch": 0.2438563327032136, "grad_norm": 2.019747189377822, "kl": 0.054931640625, "learning_rate": 1.7206220278534285e-06, "loss": 0.0022, "reward": 1.5207030773162842, "reward_std": 0.14799641072750092, "rewards/accuracy_reward": 0.528515636920929, "rewards/format_reward": 0.9921875, "step": 129 }, { "all_correct": 0.4375, "all_wrong": 0.15625, "completion_length": 92.640625, "epoch": 0.24574669187145556, "grad_norm": 1.3384988128455364, "kl": 0.054443359375, "learning_rate": 1.7164918499939501e-06, "loss": 0.0022, "reward": 1.621284008026123, "reward_std": 0.16135218739509583, "rewards/accuracy_reward": 0.6330028176307678, "rewards/format_reward": 0.98828125, "step": 130 }, { "all_correct": 0.375, "all_wrong": 0.21875, "completion_length": 93.1015625, "epoch": 0.24763705103969755, "grad_norm": 1.687141544181196, "kl": 0.0576171875, "learning_rate": 1.712336402533761e-06, "loss": 0.0023, "reward": 1.5703125, "reward_std": 0.19187898933887482, "rewards/accuracy_reward": 0.5703125, "rewards/format_reward": 1.0, "step": 131 }, { "all_correct": 0.34375, "all_wrong": 0.1875, "completion_length": 92.66796875, "epoch": 0.2495274102079395, "grad_norm": 79.59867493675014, "kl": 0.048828125, "learning_rate": 1.7081558320293053e-06, "loss": 0.002, "reward": 1.590255618095398, "reward_std": 0.1713361293077469, "rewards/accuracy_reward": 0.5941617488861084, "rewards/format_reward": 0.99609375, "step": 132 }, { "all_correct": 0.5, "all_wrong": 0.09375, "completion_length": 87.046875, "epoch": 0.2514177693761815, "grad_norm": 1.4501119642103175, "kl": 0.04833984375, "learning_rate": 1.7039502859230797e-06, "loss": 0.0019, "reward": 1.6924138069152832, "reward_std": 0.1737845093011856, "rewards/accuracy_reward": 0.6924139261245728, "rewards/format_reward": 1.0, "step": 133 }, { "all_correct": 0.40625, "all_wrong": 0.125, "completion_length": 83.3828125, "epoch": 0.2533081285444234, "grad_norm": 1.753007888815303, "kl": 0.060791015625, "learning_rate": 1.699719912538434e-06, "loss": 0.0024, "reward": 1.6514040231704712, "reward_std": 0.15027320384979248, "rewards/accuracy_reward": 0.6514040231704712, "rewards/format_reward": 1.0, "step": 134 }, { "all_correct": 0.1875, "all_wrong": 0.1875, "completion_length": 84.77734375, "epoch": 0.2551984877126654, "grad_norm": 2.162337902604871, "kl": 0.05712890625, "learning_rate": 1.6954648610743384e-06, "loss": 0.0023, "reward": 1.534517765045166, "reward_std": 0.2556450366973877, "rewards/accuracy_reward": 0.534517765045166, "rewards/format_reward": 1.0, "step": 135 }, { "all_correct": 0.21875, "all_wrong": 0.25, "completion_length": 91.46484375, "epoch": 0.2570888468809074, "grad_norm": 2.2567486633448244, "kl": 0.059326171875, "learning_rate": 1.6911852816001217e-06, "loss": 0.0024, "reward": 1.4765625, "reward_std": 0.22168521583080292, "rewards/accuracy_reward": 0.4765624701976776, "rewards/format_reward": 1.0, "step": 136 }, { "all_correct": 0.28125, "all_wrong": 0.15625, "completion_length": 95.22265625, "epoch": 0.2589792060491493, "grad_norm": 1.685961718455796, "kl": 0.053466796875, "learning_rate": 1.6868813250501808e-06, "loss": 0.0021, "reward": 1.5958139896392822, "reward_std": 0.23510046303272247, "rewards/accuracy_reward": 0.595814049243927, "rewards/format_reward": 1.0, "step": 137 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 92.0390625, "epoch": 0.2608695652173913, "grad_norm": 3.0694185267878074, "kl": 0.0517578125, "learning_rate": 1.682553143218654e-06, "loss": 0.0021, "reward": 1.6615285873413086, "reward_std": 0.18214064836502075, "rewards/accuracy_reward": 0.6810599565505981, "rewards/format_reward": 0.98046875, "step": 138 }, { "all_correct": 0.34375, "all_wrong": 0.1875, "completion_length": 95.0703125, "epoch": 0.2627599243856333, "grad_norm": 1.8498537052874253, "kl": 0.050537109375, "learning_rate": 1.6782008887540702e-06, "loss": 0.002, "reward": 1.5477758646011353, "reward_std": 0.20087680220603943, "rewards/accuracy_reward": 0.55558842420578, "rewards/format_reward": 0.9921875, "step": 139 }, { "all_correct": 0.25, "all_wrong": 0.3125, "completion_length": 90.0, "epoch": 0.2646502835538752, "grad_norm": 1.4885477074641043, "kl": 0.0556640625, "learning_rate": 1.6738247151539643e-06, "loss": 0.0022, "reward": 1.4492642879486084, "reward_std": 0.17115281522274017, "rewards/accuracy_reward": 0.4570767879486084, "rewards/format_reward": 0.9921875, "step": 140 }, { "all_correct": 0.1875, "all_wrong": 0.1875, "completion_length": 93.39453125, "epoch": 0.2665406427221172, "grad_norm": 1.8807841599551895, "kl": 0.047119140625, "learning_rate": 1.6694247767594622e-06, "loss": 0.0019, "reward": 1.4642714262008667, "reward_std": 0.260834664106369, "rewards/accuracy_reward": 0.4838026762008667, "rewards/format_reward": 0.98046875, "step": 141 }, { "all_correct": 0.46875, "all_wrong": 0.28125, "completion_length": 84.546875, "epoch": 0.2684310018903592, "grad_norm": 1.820094916728817, "kl": 0.059814453125, "learning_rate": 1.665001228749841e-06, "loss": 0.0024, "reward": 1.566421627998352, "reward_std": 0.1025347113609314, "rewards/accuracy_reward": 0.566421627998352, "rewards/format_reward": 1.0, "step": 142 }, { "all_correct": 0.34375, "all_wrong": 0.28125, "completion_length": 91.03125, "epoch": 0.27032136105860116, "grad_norm": 5.252949042087521, "kl": 0.05859375, "learning_rate": 1.6605542271370511e-06, "loss": 0.0023, "reward": 1.5315755605697632, "reward_std": 0.18068033456802368, "rewards/accuracy_reward": 0.5315755605697632, "rewards/format_reward": 1.0, "step": 143 }, { "all_correct": 0.4375, "all_wrong": 0.25, "completion_length": 100.015625, "epoch": 0.2722117202268431, "grad_norm": 1.417138335386092, "kl": 0.050537109375, "learning_rate": 1.6560839287602191e-06, "loss": 0.002, "reward": 1.5791447162628174, "reward_std": 0.14070533215999603, "rewards/accuracy_reward": 0.5986760258674622, "rewards/format_reward": 0.98046875, "step": 144 }, { "all_correct": 0.40625, "all_wrong": 0.3125, "completion_length": 94.4375, "epoch": 0.2741020793950851, "grad_norm": 1.8097027337596667, "kl": 0.050537109375, "learning_rate": 1.6515904912801118e-06, "loss": 0.002, "reward": 1.4946039915084839, "reward_std": 0.09259741008281708, "rewards/accuracy_reward": 0.4946039319038391, "rewards/format_reward": 1.0, "step": 145 }, { "all_correct": 0.375, "all_wrong": 0.21875, "completion_length": 90.09375, "epoch": 0.27599243856332706, "grad_norm": 1.7093281467382162, "kl": 0.056884765625, "learning_rate": 1.6470740731735786e-06, "loss": 0.0023, "reward": 1.5983502864837646, "reward_std": 0.1455744206905365, "rewards/accuracy_reward": 0.6022564768791199, "rewards/format_reward": 0.99609375, "step": 146 }, { "all_correct": 0.40625, "all_wrong": 0.375, "completion_length": 95.01953125, "epoch": 0.277882797731569, "grad_norm": 1.4598558142559408, "kl": 0.060546875, "learning_rate": 1.6425348337279617e-06, "loss": 0.0024, "reward": 1.524511694908142, "reward_std": 0.08749698102474213, "rewards/accuracy_reward": 0.5284179449081421, "rewards/format_reward": 0.99609375, "step": 147 }, { "all_correct": 0.3125, "all_wrong": 0.1875, "completion_length": 97.53515625, "epoch": 0.27977315689981097, "grad_norm": 1.6254939755919178, "kl": 0.0478515625, "learning_rate": 1.6379729330354773e-06, "loss": 0.0019, "reward": 1.5108827352523804, "reward_std": 0.1570434868335724, "rewards/accuracy_reward": 0.5108827352523804, "rewards/format_reward": 1.0, "step": 148 }, { "all_correct": 0.40625, "all_wrong": 0.21875, "completion_length": 102.171875, "epoch": 0.28166351606805295, "grad_norm": 6.200286331983825, "kl": 0.05322265625, "learning_rate": 1.63338853198757e-06, "loss": 0.0021, "reward": 1.59765625, "reward_std": 0.15199562907218933, "rewards/accuracy_reward": 0.59765625, "rewards/format_reward": 1.0, "step": 149 }, { "all_correct": 0.375, "all_wrong": 0.21875, "completion_length": 93.01953125, "epoch": 0.2835538752362949, "grad_norm": 1.3238758411422094, "kl": 0.06005859375, "learning_rate": 1.6287817922692394e-06, "loss": 0.0024, "reward": 1.5933270454406738, "reward_std": 0.1732364296913147, "rewards/accuracy_reward": 0.593326985836029, "rewards/format_reward": 1.0, "step": 150 }, { "all_correct": 0.375, "all_wrong": 0.28125, "completion_length": 100.703125, "epoch": 0.28544423440453687, "grad_norm": 1.4671353082453924, "kl": 0.05615234375, "learning_rate": 1.6241528763533351e-06, "loss": 0.0022, "reward": 1.521083950996399, "reward_std": 0.14775413274765015, "rewards/accuracy_reward": 0.5249902009963989, "rewards/format_reward": 0.99609375, "step": 151 }, { "all_correct": 0.28125, "all_wrong": 0.21875, "completion_length": 104.98046875, "epoch": 0.28733459357277885, "grad_norm": 1.7191498537725731, "kl": 0.0517578125, "learning_rate": 1.6195019474948298e-06, "loss": 0.0021, "reward": 1.5247777700424194, "reward_std": 0.1547752171754837, "rewards/accuracy_reward": 0.5247777700424194, "rewards/format_reward": 1.0, "step": 152 }, { "all_correct": 0.34375, "all_wrong": 0.34375, "completion_length": 95.25, "epoch": 0.2892249527410208, "grad_norm": 1.1025414542451693, "kl": 0.05224609375, "learning_rate": 1.6148291697250592e-06, "loss": 0.0021, "reward": 1.4935517311096191, "reward_std": 0.06223775073885918, "rewards/accuracy_reward": 0.49355170130729675, "rewards/format_reward": 1.0, "step": 153 }, { "all_correct": 0.3125, "all_wrong": 0.21875, "completion_length": 97.43359375, "epoch": 0.29111531190926276, "grad_norm": 3.2260317883554293, "kl": 0.05224609375, "learning_rate": 1.6101347078459374e-06, "loss": 0.0021, "reward": 1.5610017776489258, "reward_std": 0.1826038360595703, "rewards/accuracy_reward": 0.5610017776489258, "rewards/format_reward": 1.0, "step": 154 }, { "all_correct": 0.40625, "all_wrong": 0.125, "completion_length": 98.3828125, "epoch": 0.29300567107750475, "grad_norm": 3.2855377413213036, "kl": 0.054443359375, "learning_rate": 1.6054187274241447e-06, "loss": 0.0022, "reward": 1.6555452346801758, "reward_std": 0.21821025013923645, "rewards/accuracy_reward": 0.6828888654708862, "rewards/format_reward": 0.97265625, "step": 155 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 99.22265625, "epoch": 0.2948960302457467, "grad_norm": 3.1851976233257986, "kl": 0.049072265625, "learning_rate": 1.6006813947852892e-06, "loss": 0.002, "reward": 1.4952456951141357, "reward_std": 0.2039714902639389, "rewards/accuracy_reward": 0.49524572491645813, "rewards/format_reward": 1.0, "step": 156 }, { "all_correct": 0.28125, "all_wrong": 0.25, "completion_length": 92.09375, "epoch": 0.29678638941398866, "grad_norm": 2.1557933244114107, "kl": 0.055908203125, "learning_rate": 1.5959228770080389e-06, "loss": 0.0022, "reward": 1.54817795753479, "reward_std": 0.19130109250545502, "rewards/accuracy_reward": 0.54817795753479, "rewards/format_reward": 1.0, "step": 157 }, { "all_correct": 0.65625, "all_wrong": 0.0625, "completion_length": 87.30078125, "epoch": 0.29867674858223064, "grad_norm": 1.5896533313014187, "kl": 0.052490234375, "learning_rate": 1.5911433419182304e-06, "loss": 0.0021, "reward": 1.7363414764404297, "reward_std": 0.09228336811065674, "rewards/accuracy_reward": 0.7363415956497192, "rewards/format_reward": 1.0, "step": 158 }, { "all_correct": 0.34375, "all_wrong": 0.21875, "completion_length": 98.4140625, "epoch": 0.3005671077504726, "grad_norm": 8.013680822640337, "kl": 0.051513671875, "learning_rate": 1.5863429580829499e-06, "loss": 0.0021, "reward": 1.567818522453308, "reward_std": 0.161808043718338, "rewards/accuracy_reward": 0.5709435343742371, "rewards/format_reward": 0.99609375, "step": 159 }, { "all_correct": 0.3125, "all_wrong": 0.125, "completion_length": 95.6484375, "epoch": 0.30245746691871456, "grad_norm": 1.8629346782025675, "kl": 0.0595703125, "learning_rate": 1.5815218948045877e-06, "loss": 0.0024, "reward": 1.6700589656829834, "reward_std": 0.22937864065170288, "rewards/accuracy_reward": 0.6778714656829834, "rewards/format_reward": 0.9921875, "step": 160 }, { "all_correct": 0.28125, "all_wrong": 0.34375, "completion_length": 90.875, "epoch": 0.30434782608695654, "grad_norm": 1.2131759411170726, "kl": 0.0556640625, "learning_rate": 1.5766803221148673e-06, "loss": 0.0022, "reward": 1.476467490196228, "reward_std": 0.1405719369649887, "rewards/accuracy_reward": 0.4764673709869385, "rewards/format_reward": 1.0, "step": 161 }, { "all_correct": 0.34375, "all_wrong": 0.15625, "completion_length": 98.62109375, "epoch": 0.30623818525519847, "grad_norm": 1.575835196195424, "kl": 0.049072265625, "learning_rate": 1.571818410768848e-06, "loss": 0.002, "reward": 1.5944631099700928, "reward_std": 0.18872258067131042, "rewards/accuracy_reward": 0.594463050365448, "rewards/format_reward": 1.0, "step": 162 }, { "all_correct": 0.28125, "all_wrong": 0.28125, "completion_length": 98.5078125, "epoch": 0.30812854442344045, "grad_norm": 1.3803997172387867, "kl": 0.04736328125, "learning_rate": 1.566936332238904e-06, "loss": 0.0019, "reward": 1.5406312942504883, "reward_std": 0.14286097884178162, "rewards/accuracy_reward": 0.5484437942504883, "rewards/format_reward": 0.9921875, "step": 163 }, { "all_correct": 0.34375, "all_wrong": 0.15625, "completion_length": 96.4296875, "epoch": 0.31001890359168244, "grad_norm": 4.219394179057573, "kl": 0.05615234375, "learning_rate": 1.5620342587086756e-06, "loss": 0.0022, "reward": 1.6263850927352905, "reward_std": 0.19135481119155884, "rewards/accuracy_reward": 0.6263850927352905, "rewards/format_reward": 1.0, "step": 164 }, { "all_correct": 0.34375, "all_wrong": 0.28125, "completion_length": 95.87890625, "epoch": 0.31190926275992437, "grad_norm": 1.4782346468519787, "kl": 0.048095703125, "learning_rate": 1.5571123630669977e-06, "loss": 0.0019, "reward": 1.5589654445648193, "reward_std": 0.15776313841342926, "rewards/accuracy_reward": 0.5628716349601746, "rewards/format_reward": 0.99609375, "step": 165 }, { "all_correct": 0.21875, "all_wrong": 0.21875, "completion_length": 99.24609375, "epoch": 0.31379962192816635, "grad_norm": 1.7150440549446804, "kl": 0.04736328125, "learning_rate": 1.5521708189018004e-06, "loss": 0.0019, "reward": 1.4944391250610352, "reward_std": 0.2474714070558548, "rewards/accuracy_reward": 0.4944390654563904, "rewards/format_reward": 1.0, "step": 166 }, { "all_correct": 0.34375, "all_wrong": 0.21875, "completion_length": 92.48828125, "epoch": 0.31568998109640833, "grad_norm": 2.0979330226121116, "kl": 0.05224609375, "learning_rate": 1.5472098004939887e-06, "loss": 0.0021, "reward": 1.606818675994873, "reward_std": 0.19165176153182983, "rewards/accuracy_reward": 0.610724925994873, "rewards/format_reward": 0.99609375, "step": 167 }, { "all_correct": 0.375, "all_wrong": 0.4375, "completion_length": 98.86328125, "epoch": 0.31758034026465026, "grad_norm": 1.741385766018968, "kl": 0.0458984375, "learning_rate": 1.5422294828112952e-06, "loss": 0.0018, "reward": 1.4513907432556152, "reward_std": 0.07461512833833694, "rewards/accuracy_reward": 0.45529699325561523, "rewards/format_reward": 0.99609375, "step": 168 }, { "all_correct": 0.15625, "all_wrong": 0.28125, "completion_length": 104.0859375, "epoch": 0.31947069943289225, "grad_norm": 2.346864808547952, "kl": 0.041748046875, "learning_rate": 1.537230041502109e-06, "loss": 0.0017, "reward": 1.4551925659179688, "reward_std": 0.19008949398994446, "rewards/accuracy_reward": 0.46691131591796875, "rewards/format_reward": 0.98828125, "step": 169 }, { "all_correct": 0.5, "all_wrong": 0.15625, "completion_length": 90.89453125, "epoch": 0.32136105860113423, "grad_norm": 1.6647816583702195, "kl": 0.05078125, "learning_rate": 1.5322116528892807e-06, "loss": 0.002, "reward": 1.731924295425415, "reward_std": 0.12868990004062653, "rewards/accuracy_reward": 0.7319241762161255, "rewards/format_reward": 1.0, "step": 170 }, { "all_correct": 0.28125, "all_wrong": 0.28125, "completion_length": 97.6015625, "epoch": 0.32325141776937616, "grad_norm": 1.4603160187510866, "kl": 0.04833984375, "learning_rate": 1.527174493963905e-06, "loss": 0.0019, "reward": 1.4863568544387817, "reward_std": 0.18577060103416443, "rewards/accuracy_reward": 0.49416929483413696, "rewards/format_reward": 0.9921875, "step": 171 }, { "all_correct": 0.46875, "all_wrong": 0.21875, "completion_length": 95.61328125, "epoch": 0.32514177693761814, "grad_norm": 1.2779207893845528, "kl": 0.05419921875, "learning_rate": 1.5221187423790758e-06, "loss": 0.0022, "reward": 1.6187939643859863, "reward_std": 0.1073196530342102, "rewards/accuracy_reward": 0.6227001547813416, "rewards/format_reward": 0.99609375, "step": 172 }, { "all_correct": 0.34375, "all_wrong": 0.21875, "completion_length": 95.03125, "epoch": 0.3270321361058601, "grad_norm": 1.594280468798857, "kl": 0.05712890625, "learning_rate": 1.517044576443625e-06, "loss": 0.0023, "reward": 1.5663572549819946, "reward_std": 0.14794546365737915, "rewards/accuracy_reward": 0.5663573741912842, "rewards/format_reward": 1.0, "step": 173 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 96.19921875, "epoch": 0.32892249527410206, "grad_norm": 35.90029700014664, "kl": 0.056396484375, "learning_rate": 1.5119521751158296e-06, "loss": 0.0023, "reward": 1.7366479635238647, "reward_std": 0.1478818655014038, "rewards/accuracy_reward": 0.7366479635238647, "rewards/format_reward": 1.0, "step": 174 }, { "all_correct": 0.46875, "all_wrong": 0.1875, "completion_length": 94.14453125, "epoch": 0.33081285444234404, "grad_norm": 1.1879295692139324, "kl": 0.05029296875, "learning_rate": 1.5068417179971013e-06, "loss": 0.002, "reward": 1.6065101623535156, "reward_std": 0.1347871571779251, "rewards/accuracy_reward": 0.6104164123535156, "rewards/format_reward": 0.99609375, "step": 175 }, { "all_correct": 0.4375, "all_wrong": 0.1875, "completion_length": 92.0078125, "epoch": 0.332703213610586, "grad_norm": 1.7027171491945203, "kl": 0.056640625, "learning_rate": 1.5017133853256536e-06, "loss": 0.0023, "reward": 1.6415752172470093, "reward_std": 0.15971241891384125, "rewards/accuracy_reward": 0.6415751576423645, "rewards/format_reward": 1.0, "step": 176 }, { "all_correct": 0.4375, "all_wrong": 0.25, "completion_length": 93.62109375, "epoch": 0.33459357277882795, "grad_norm": 1.3008308802468183, "kl": 0.05517578125, "learning_rate": 1.4965673579701444e-06, "loss": 0.0022, "reward": 1.5429213047027588, "reward_std": 0.11785108596086502, "rewards/accuracy_reward": 0.5546400547027588, "rewards/format_reward": 0.98828125, "step": 177 }, { "all_correct": 0.21875, "all_wrong": 0.34375, "completion_length": 101.2578125, "epoch": 0.33648393194706994, "grad_norm": 2.0499201856789537, "kl": 0.04638671875, "learning_rate": 1.4914038174232954e-06, "loss": 0.0019, "reward": 1.4305205345153809, "reward_std": 0.18109694123268127, "rewards/accuracy_reward": 0.4383331537246704, "rewards/format_reward": 0.9921875, "step": 178 }, { "all_correct": 0.46875, "all_wrong": 0.15625, "completion_length": 94.9765625, "epoch": 0.3383742911153119, "grad_norm": 1.3560089780453208, "kl": 0.051025390625, "learning_rate": 1.4862229457954937e-06, "loss": 0.002, "reward": 1.6644376516342163, "reward_std": 0.12696640193462372, "rewards/accuracy_reward": 0.6644376516342163, "rewards/format_reward": 1.0, "step": 179 }, { "all_correct": 0.4375, "all_wrong": 0.21875, "completion_length": 95.62109375, "epoch": 0.34026465028355385, "grad_norm": 1.312847563615755, "kl": 0.052490234375, "learning_rate": 1.4810249258083676e-06, "loss": 0.0021, "reward": 1.621772050857544, "reward_std": 0.15236923098564148, "rewards/accuracy_reward": 0.621772050857544, "rewards/format_reward": 1.0, "step": 180 }, { "all_correct": 0.40625, "all_wrong": 0.3125, "completion_length": 97.97265625, "epoch": 0.34215500945179583, "grad_norm": 1.5335653397548066, "kl": 0.047119140625, "learning_rate": 1.475809940788342e-06, "loss": 0.0019, "reward": 1.5216861963272095, "reward_std": 0.09969654679298401, "rewards/accuracy_reward": 0.5255923867225647, "rewards/format_reward": 0.99609375, "step": 181 }, { "all_correct": 0.34375, "all_wrong": 0.1875, "completion_length": 90.13671875, "epoch": 0.3440453686200378, "grad_norm": 1.6455259403603757, "kl": 0.050537109375, "learning_rate": 1.4705781746601738e-06, "loss": 0.002, "reward": 1.569726586341858, "reward_std": 0.20621807873249054, "rewards/accuracy_reward": 0.5736328363418579, "rewards/format_reward": 0.99609375, "step": 182 }, { "all_correct": 0.34375, "all_wrong": 0.21875, "completion_length": 95.23828125, "epoch": 0.34593572778827975, "grad_norm": 1.7087145386845783, "kl": 0.048095703125, "learning_rate": 1.4653298119404645e-06, "loss": 0.0019, "reward": 1.566096544265747, "reward_std": 0.16752052307128906, "rewards/accuracy_reward": 0.5660964846611023, "rewards/format_reward": 1.0, "step": 183 }, { "all_correct": 0.4375, "all_wrong": 0.28125, "completion_length": 86.921875, "epoch": 0.34782608695652173, "grad_norm": 1.4554606229566311, "kl": 0.054443359375, "learning_rate": 1.460065037731152e-06, "loss": 0.0022, "reward": 1.592024326324463, "reward_std": 0.10995283722877502, "rewards/accuracy_reward": 0.5920243859291077, "rewards/format_reward": 1.0, "step": 184 }, { "all_correct": 0.5, "all_wrong": 0.21875, "completion_length": 94.37890625, "epoch": 0.3497164461247637, "grad_norm": 1.030689978981618, "kl": 0.044921875, "learning_rate": 1.454784037712984e-06, "loss": 0.0018, "reward": 1.6271023750305176, "reward_std": 0.09506059437990189, "rewards/accuracy_reward": 0.6271023750305176, "rewards/format_reward": 1.0, "step": 185 }, { "all_correct": 0.5625, "all_wrong": 0.1875, "completion_length": 92.4765625, "epoch": 0.3516068052930057, "grad_norm": 2.3312930785993657, "kl": 0.05224609375, "learning_rate": 1.449486998138968e-06, "loss": 0.0021, "reward": 1.6988677978515625, "reward_std": 0.08607158064842224, "rewards/accuracy_reward": 0.698867678642273, "rewards/format_reward": 1.0, "step": 186 }, { "all_correct": 0.3125, "all_wrong": 0.3125, "completion_length": 94.50390625, "epoch": 0.3534971644612476, "grad_norm": 1.370501060551843, "kl": 0.050048828125, "learning_rate": 1.4441741058278024e-06, "loss": 0.002, "reward": 1.5129822492599487, "reward_std": 0.1419016569852829, "rewards/accuracy_reward": 0.5129822492599487, "rewards/format_reward": 1.0, "step": 187 }, { "all_correct": 0.4375, "all_wrong": 0.1875, "completion_length": 89.53515625, "epoch": 0.3553875236294896, "grad_norm": 1.4724253630775335, "kl": 0.048828125, "learning_rate": 1.4388455481572878e-06, "loss": 0.002, "reward": 1.6355903148651123, "reward_std": 0.14424622058868408, "rewards/accuracy_reward": 0.6394965648651123, "rewards/format_reward": 0.99609375, "step": 188 }, { "all_correct": 0.46875, "all_wrong": 0.21875, "completion_length": 99.15234375, "epoch": 0.3572778827977316, "grad_norm": 1.1589530938405725, "kl": 0.046142578125, "learning_rate": 1.4335015130577198e-06, "loss": 0.0018, "reward": 1.6473121643066406, "reward_std": 0.10182252526283264, "rewards/accuracy_reward": 0.6473122239112854, "rewards/format_reward": 1.0, "step": 189 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 86.296875, "epoch": 0.3591682419659735, "grad_norm": 1.5269460633562093, "kl": 0.04541015625, "learning_rate": 1.428142189005259e-06, "loss": 0.0018, "reward": 1.7168800830841064, "reward_std": 0.12831273674964905, "rewards/accuracy_reward": 0.7168800234794617, "rewards/format_reward": 1.0, "step": 190 }, { "all_correct": 0.46875, "all_wrong": 0.125, "completion_length": 90.84765625, "epoch": 0.3610586011342155, "grad_norm": 1.5710398769636844, "kl": 0.051513671875, "learning_rate": 1.4227677650152847e-06, "loss": 0.0021, "reward": 1.6753089427947998, "reward_std": 0.15371738374233246, "rewards/accuracy_reward": 0.6753089427947998, "rewards/format_reward": 1.0, "step": 191 }, { "all_correct": 0.5625, "all_wrong": 0.15625, "completion_length": 97.41015625, "epoch": 0.3629489603024575, "grad_norm": 1.3229536281928582, "kl": 0.0478515625, "learning_rate": 1.417378430635729e-06, "loss": 0.0019, "reward": 1.6942713260650635, "reward_std": 0.08952474594116211, "rewards/accuracy_reward": 0.6942713260650635, "rewards/format_reward": 1.0, "step": 192 }, { "all_correct": 0.34375, "all_wrong": 0.1875, "completion_length": 85.03125, "epoch": 0.3648393194706994, "grad_norm": 2.579144764904904, "kl": 0.0537109375, "learning_rate": 1.4119743759403907e-06, "loss": 0.0021, "reward": 1.609615683555603, "reward_std": 0.1612345427274704, "rewards/accuracy_reward": 0.609615683555603, "rewards/format_reward": 1.0, "step": 193 }, { "all_correct": 0.46875, "all_wrong": 0.125, "completion_length": 98.421875, "epoch": 0.3667296786389414, "grad_norm": 4.601074879313099, "kl": 0.055908203125, "learning_rate": 1.406555791522232e-06, "loss": 0.0022, "reward": 1.6692792177200317, "reward_std": 0.1721545159816742, "rewards/accuracy_reward": 0.6731854677200317, "rewards/format_reward": 0.99609375, "step": 194 }, { "all_correct": 0.46875, "all_wrong": 0.25, "completion_length": 86.79296875, "epoch": 0.3686200378071834, "grad_norm": 1.505355960378617, "kl": 0.048095703125, "learning_rate": 1.401122868486658e-06, "loss": 0.0019, "reward": 1.6007030010223389, "reward_std": 0.1112457737326622, "rewards/accuracy_reward": 0.6007030606269836, "rewards/format_reward": 1.0, "step": 195 }, { "all_correct": 0.28125, "all_wrong": 0.1875, "completion_length": 92.05078125, "epoch": 0.3705103969754253, "grad_norm": 4.21038845661736, "kl": 0.048095703125, "learning_rate": 1.3956757984447744e-06, "loss": 0.0019, "reward": 1.5570985078811646, "reward_std": 0.15561020374298096, "rewards/accuracy_reward": 0.5570985078811646, "rewards/format_reward": 1.0, "step": 196 }, { "all_correct": 0.46875, "all_wrong": 0.1875, "completion_length": 108.05859375, "epoch": 0.3724007561436673, "grad_norm": 1.8421976998281828, "kl": 0.0458984375, "learning_rate": 1.3902147735066305e-06, "loss": 0.0018, "reward": 1.6113016605377197, "reward_std": 0.153774231672287, "rewards/accuracy_reward": 0.615207850933075, "rewards/format_reward": 0.99609375, "step": 197 }, { "all_correct": 0.375, "all_wrong": 0.15625, "completion_length": 100.32421875, "epoch": 0.3742911153119093, "grad_norm": 1.4377049848914711, "kl": 0.048583984375, "learning_rate": 1.3847399862744449e-06, "loss": 0.0019, "reward": 1.5971312522888184, "reward_std": 0.1938011348247528, "rewards/accuracy_reward": 0.6010375022888184, "rewards/format_reward": 0.99609375, "step": 198 }, { "all_correct": 0.375, "all_wrong": 0.09375, "completion_length": 94.0, "epoch": 0.3761814744801512, "grad_norm": 2.662364277169509, "kl": 0.053466796875, "learning_rate": 1.3792516298358115e-06, "loss": 0.0021, "reward": 1.6497100591659546, "reward_std": 0.12323208153247833, "rewards/accuracy_reward": 0.6497100591659546, "rewards/format_reward": 1.0, "step": 199 }, { "all_correct": 0.375, "all_wrong": 0.15625, "completion_length": 93.59765625, "epoch": 0.3780718336483932, "grad_norm": 1.7035529891437522, "kl": 0.049560546875, "learning_rate": 1.37374989775689e-06, "loss": 0.002, "reward": 1.563701868057251, "reward_std": 0.1927732229232788, "rewards/accuracy_reward": 0.563701868057251, "rewards/format_reward": 1.0, "step": 200 }, { "all_correct": 0.4375, "all_wrong": 0.21875, "completion_length": 85.83203125, "epoch": 0.3799621928166352, "grad_norm": 1.2571615120784285, "kl": 0.0537109375, "learning_rate": 1.3682349840755786e-06, "loss": 0.0021, "reward": 1.6499078273773193, "reward_std": 0.12151362746953964, "rewards/accuracy_reward": 0.6538141369819641, "rewards/format_reward": 0.99609375, "step": 201 }, { "all_correct": 0.1875, "all_wrong": 0.1875, "completion_length": 92.55078125, "epoch": 0.3818525519848771, "grad_norm": 1.8828233098315525, "kl": 0.052978515625, "learning_rate": 1.3627070832946716e-06, "loss": 0.0021, "reward": 1.5092294216156006, "reward_std": 0.23872140049934387, "rewards/accuracy_reward": 0.5092294216156006, "rewards/format_reward": 1.0, "step": 202 }, { "all_correct": 0.40625, "all_wrong": 0.1875, "completion_length": 90.94140625, "epoch": 0.3837429111531191, "grad_norm": 1.4414792921156796, "kl": 0.04833984375, "learning_rate": 1.3571663903749984e-06, "loss": 0.0019, "reward": 1.5634148120880127, "reward_std": 0.18181678652763367, "rewards/accuracy_reward": 0.5673210024833679, "rewards/format_reward": 0.99609375, "step": 203 }, { "all_correct": 0.375, "all_wrong": 0.21875, "completion_length": 93.390625, "epoch": 0.3856332703213611, "grad_norm": 1.76318060561253, "kl": 0.05615234375, "learning_rate": 1.351613100728548e-06, "loss": 0.0022, "reward": 1.5766992568969727, "reward_std": 0.18105342984199524, "rewards/accuracy_reward": 0.5766991972923279, "rewards/format_reward": 1.0, "step": 204 }, { "all_correct": 0.34375, "all_wrong": 0.1875, "completion_length": 91.85546875, "epoch": 0.387523629489603, "grad_norm": 1.498906587190416, "kl": 0.046142578125, "learning_rate": 1.3460474102115784e-06, "loss": 0.0018, "reward": 1.5816829204559326, "reward_std": 0.2008872926235199, "rewards/accuracy_reward": 0.5816829204559326, "rewards/format_reward": 1.0, "step": 205 }, { "all_correct": 0.3125, "all_wrong": 0.1875, "completion_length": 91.69921875, "epoch": 0.389413988657845, "grad_norm": 2.5901803764065447, "kl": 0.052001953125, "learning_rate": 1.340469515117706e-06, "loss": 0.0021, "reward": 1.5882611274719238, "reward_std": 0.1939556896686554, "rewards/accuracy_reward": 0.5882611274719238, "rewards/format_reward": 1.0, "step": 206 }, { "all_correct": 0.34375, "all_wrong": 0.125, "completion_length": 85.96875, "epoch": 0.391304347826087, "grad_norm": 1.972889284821711, "kl": 0.05615234375, "learning_rate": 1.334879612170986e-06, "loss": 0.0022, "reward": 1.667292833328247, "reward_std": 0.2238282561302185, "rewards/accuracy_reward": 0.6672928929328918, "rewards/format_reward": 1.0, "step": 207 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 93.765625, "epoch": 0.3931947069943289, "grad_norm": 1.7336225636379132, "kl": 0.0556640625, "learning_rate": 1.3292778985189722e-06, "loss": 0.0022, "reward": 1.4546148777008057, "reward_std": 0.202288419008255, "rewards/accuracy_reward": 0.4546148180961609, "rewards/format_reward": 1.0, "step": 208 }, { "all_correct": 0.34375, "all_wrong": 0.25, "completion_length": 93.42578125, "epoch": 0.3950850661625709, "grad_norm": 1.5326638144326488, "kl": 0.04296875, "learning_rate": 1.323664571725764e-06, "loss": 0.0017, "reward": 1.581210970878601, "reward_std": 0.13675948977470398, "rewards/accuracy_reward": 0.5812109708786011, "rewards/format_reward": 1.0, "step": 209 }, { "all_correct": 0.28125, "all_wrong": 0.25, "completion_length": 90.45703125, "epoch": 0.39697542533081287, "grad_norm": 1.6717753066186387, "kl": 0.055419921875, "learning_rate": 1.3180398297650392e-06, "loss": 0.0022, "reward": 1.5307865142822266, "reward_std": 0.21199406683444977, "rewards/accuracy_reward": 0.534692645072937, "rewards/format_reward": 0.99609375, "step": 210 }, { "all_correct": 0.25, "all_wrong": 0.15625, "completion_length": 88.4375, "epoch": 0.3988657844990548, "grad_norm": 3.0589163307910163, "kl": 0.045654296875, "learning_rate": 1.3124038710130721e-06, "loss": 0.0018, "reward": 1.4704865217208862, "reward_std": 0.22353151440620422, "rewards/accuracy_reward": 0.470486581325531, "rewards/format_reward": 1.0, "step": 211 }, { "all_correct": 0.3125, "all_wrong": 0.21875, "completion_length": 91.19140625, "epoch": 0.4007561436672968, "grad_norm": 1.7075754827161425, "kl": 0.052734375, "learning_rate": 1.3067568942417354e-06, "loss": 0.0021, "reward": 1.568968415260315, "reward_std": 0.19819122552871704, "rewards/accuracy_reward": 0.5689684152603149, "rewards/format_reward": 1.0, "step": 212 }, { "all_correct": 0.3125, "all_wrong": 0.15625, "completion_length": 92.08984375, "epoch": 0.40264650283553877, "grad_norm": 2.1354813874349254, "kl": 0.05029296875, "learning_rate": 1.3010990986114924e-06, "loss": 0.002, "reward": 1.533203125, "reward_std": 0.2084732949733734, "rewards/accuracy_reward": 0.552734375, "rewards/format_reward": 0.98046875, "step": 213 }, { "all_correct": 0.25, "all_wrong": 0.21875, "completion_length": 94.58203125, "epoch": 0.4045368620037807, "grad_norm": 2.165953349203724, "kl": 0.04345703125, "learning_rate": 1.29543068366437e-06, "loss": 0.0017, "reward": 1.511252760887146, "reward_std": 0.2207871973514557, "rewards/accuracy_reward": 0.511252760887146, "rewards/format_reward": 1.0, "step": 214 }, { "all_correct": 0.34375, "all_wrong": 0.28125, "completion_length": 91.39453125, "epoch": 0.4064272211720227, "grad_norm": 1.8035046342920265, "kl": 0.05322265625, "learning_rate": 1.2897518493169238e-06, "loss": 0.0021, "reward": 1.5383806228637695, "reward_std": 0.11602434515953064, "rewards/accuracy_reward": 0.54228675365448, "rewards/format_reward": 0.99609375, "step": 215 }, { "all_correct": 0.375, "all_wrong": 0.15625, "completion_length": 85.265625, "epoch": 0.40831758034026466, "grad_norm": 15.766212379911176, "kl": 0.045654296875, "learning_rate": 1.284062795853185e-06, "loss": 0.0018, "reward": 1.633461356163025, "reward_std": 0.1580500602722168, "rewards/accuracy_reward": 0.6334613561630249, "rewards/format_reward": 1.0, "step": 216 }, { "all_correct": 0.4375, "all_wrong": 0.09375, "completion_length": 91.390625, "epoch": 0.4102079395085066, "grad_norm": 1.5655799499341232, "kl": 0.05078125, "learning_rate": 1.2783637239175992e-06, "loss": 0.002, "reward": 1.670259952545166, "reward_std": 0.18198764324188232, "rewards/accuracy_reward": 0.670259952545166, "rewards/format_reward": 1.0, "step": 217 }, { "all_correct": 0.4375, "all_wrong": 0.125, "completion_length": 86.6171875, "epoch": 0.4120982986767486, "grad_norm": 3.0507186922583482, "kl": 0.0517578125, "learning_rate": 1.2726548345079474e-06, "loss": 0.0021, "reward": 1.6517325639724731, "reward_std": 0.17904990911483765, "rewards/accuracy_reward": 0.6517325639724731, "rewards/format_reward": 1.0, "step": 218 }, { "all_correct": 0.375, "all_wrong": 0.21875, "completion_length": 92.4140625, "epoch": 0.41398865784499056, "grad_norm": 2.412347509180631, "kl": 0.049560546875, "learning_rate": 1.2669363289682581e-06, "loss": 0.002, "reward": 1.55078125, "reward_std": 0.20410458743572235, "rewards/accuracy_reward": 0.57421875, "rewards/format_reward": 0.9765625, "step": 219 }, { "all_correct": 0.40625, "all_wrong": 0.1875, "completion_length": 100.796875, "epoch": 0.4158790170132325, "grad_norm": 1.1653385467510216, "kl": 0.05078125, "learning_rate": 1.261208408981708e-06, "loss": 0.002, "reward": 1.5812370777130127, "reward_std": 0.16820110380649567, "rewards/accuracy_reward": 0.5968619585037231, "rewards/format_reward": 0.984375, "step": 220 }, { "all_correct": 0.5, "all_wrong": 0.1875, "completion_length": 89.671875, "epoch": 0.41776937618147447, "grad_norm": 1.4929073817173293, "kl": 0.048828125, "learning_rate": 1.2554712765635057e-06, "loss": 0.0019, "reward": 1.6370710134506226, "reward_std": 0.11428119242191315, "rewards/accuracy_reward": 0.6370710134506226, "rewards/format_reward": 1.0, "step": 221 }, { "all_correct": 0.28125, "all_wrong": 0.1875, "completion_length": 85.70703125, "epoch": 0.41965973534971646, "grad_norm": 2.2938159154637265, "kl": 0.052490234375, "learning_rate": 1.2497251340537688e-06, "loss": 0.0021, "reward": 1.5362218618392944, "reward_std": 0.2033883035182953, "rewards/accuracy_reward": 0.5440343022346497, "rewards/format_reward": 0.9921875, "step": 222 }, { "all_correct": 0.375, "all_wrong": 0.09375, "completion_length": 88.765625, "epoch": 0.4215500945179584, "grad_norm": 1.590914762810909, "kl": 0.045654296875, "learning_rate": 1.2439701841103886e-06, "loss": 0.0018, "reward": 1.672126293182373, "reward_std": 0.18559589982032776, "rewards/accuracy_reward": 0.6721263527870178, "rewards/format_reward": 1.0, "step": 223 }, { "all_correct": 0.21875, "all_wrong": 0.21875, "completion_length": 88.734375, "epoch": 0.42344045368620037, "grad_norm": 4.4273662063319925, "kl": 0.051025390625, "learning_rate": 1.2382066297018804e-06, "loss": 0.002, "reward": 1.5649325847625732, "reward_std": 0.2352992594242096, "rewards/accuracy_reward": 0.5649325847625732, "rewards/format_reward": 1.0, "step": 224 }, { "all_correct": 0.5, "all_wrong": 0.1875, "completion_length": 88.78515625, "epoch": 0.42533081285444235, "grad_norm": 2.4145377912675245, "kl": 0.048828125, "learning_rate": 1.2324346741002259e-06, "loss": 0.002, "reward": 1.6205267906188965, "reward_std": 0.1337902843952179, "rewards/accuracy_reward": 0.6205266714096069, "rewards/format_reward": 1.0, "step": 225 }, { "all_correct": 0.21875, "all_wrong": 0.28125, "completion_length": 87.2734375, "epoch": 0.42722117202268434, "grad_norm": 1.3878030528846113, "kl": 0.049072265625, "learning_rate": 1.2266545208737054e-06, "loss": 0.002, "reward": 1.4790351390838623, "reward_std": 0.1779412180185318, "rewards/accuracy_reward": 0.47903522849082947, "rewards/format_reward": 1.0, "step": 226 }, { "all_correct": 0.375, "all_wrong": 0.3125, "completion_length": 85.23046875, "epoch": 0.42911153119092627, "grad_norm": 2.600137524410405, "kl": 0.05126953125, "learning_rate": 1.2208663738797165e-06, "loss": 0.0021, "reward": 1.5255839824676514, "reward_std": 0.10616068542003632, "rewards/accuracy_reward": 0.5255839824676514, "rewards/format_reward": 1.0, "step": 227 }, { "all_correct": 0.34375, "all_wrong": 0.25, "completion_length": 89.4375, "epoch": 0.43100189035916825, "grad_norm": 2.29638301880618, "kl": 0.0517578125, "learning_rate": 1.2150704372575853e-06, "loss": 0.0021, "reward": 1.525526762008667, "reward_std": 0.1583402007818222, "rewards/accuracy_reward": 0.5255266427993774, "rewards/format_reward": 1.0, "step": 228 }, { "all_correct": 0.53125, "all_wrong": 0.21875, "completion_length": 84.92578125, "epoch": 0.43289224952741023, "grad_norm": 1.144130055159072, "kl": 0.05322265625, "learning_rate": 1.2092669154213664e-06, "loss": 0.0021, "reward": 1.5835583209991455, "reward_std": 0.09602068364620209, "rewards/accuracy_reward": 0.5874645113945007, "rewards/format_reward": 0.99609375, "step": 229 }, { "all_correct": 0.28125, "all_wrong": 0.15625, "completion_length": 102.9765625, "epoch": 0.43478260869565216, "grad_norm": 2.039569056317988, "kl": 0.048583984375, "learning_rate": 1.203456013052634e-06, "loss": 0.0019, "reward": 1.5296072959899902, "reward_std": 0.2428017556667328, "rewards/accuracy_reward": 0.529607355594635, "rewards/format_reward": 1.0, "step": 230 }, { "all_correct": 0.40625, "all_wrong": 0.1875, "completion_length": 100.609375, "epoch": 0.43667296786389415, "grad_norm": 1.8374962293393675, "kl": 0.046630859375, "learning_rate": 1.1976379350932618e-06, "loss": 0.0019, "reward": 1.6126770973205566, "reward_std": 0.17599979043006897, "rewards/accuracy_reward": 0.6126769781112671, "rewards/format_reward": 1.0, "step": 231 }, { "all_correct": 0.46875, "all_wrong": 0.15625, "completion_length": 86.33203125, "epoch": 0.43856332703213613, "grad_norm": 1.3726940756027504, "kl": 0.04638671875, "learning_rate": 1.1918128867381965e-06, "loss": 0.0019, "reward": 1.6991832256317139, "reward_std": 0.14738750457763672, "rewards/accuracy_reward": 0.7030894160270691, "rewards/format_reward": 0.99609375, "step": 232 }, { "all_correct": 0.375, "all_wrong": 0.3125, "completion_length": 94.38671875, "epoch": 0.44045368620037806, "grad_norm": 2.07132527283432, "kl": 0.044189453125, "learning_rate": 1.1859810734282207e-06, "loss": 0.0018, "reward": 1.5090982913970947, "reward_std": 0.12135301530361176, "rewards/accuracy_reward": 0.5325357913970947, "rewards/format_reward": 0.9765625, "step": 233 }, { "all_correct": 0.375, "all_wrong": 0.34375, "completion_length": 89.13671875, "epoch": 0.44234404536862004, "grad_norm": 1.261505450158093, "kl": 0.049072265625, "learning_rate": 1.1801427008427063e-06, "loss": 0.002, "reward": 1.5006786584854126, "reward_std": 0.11618545651435852, "rewards/accuracy_reward": 0.5006786584854126, "rewards/format_reward": 1.0, "step": 234 }, { "all_correct": 0.5625, "all_wrong": 0.15625, "completion_length": 88.6015625, "epoch": 0.444234404536862, "grad_norm": 1.6264673282441953, "kl": 0.048583984375, "learning_rate": 1.1742979748923608e-06, "loss": 0.0019, "reward": 1.7040953636169434, "reward_std": 0.08967425674200058, "rewards/accuracy_reward": 0.7040954232215881, "rewards/format_reward": 1.0, "step": 235 }, { "all_correct": 0.375, "all_wrong": 0.21875, "completion_length": 87.04296875, "epoch": 0.44612476370510395, "grad_norm": 1.8797855226580642, "kl": 0.055908203125, "learning_rate": 1.1684471017119665e-06, "loss": 0.0022, "reward": 1.5616300106048584, "reward_std": 0.10765929520130157, "rewards/accuracy_reward": 0.565536379814148, "rewards/format_reward": 0.99609375, "step": 236 }, { "all_correct": 0.34375, "all_wrong": 0.25, "completion_length": 95.9375, "epoch": 0.44801512287334594, "grad_norm": 1.3012401350602132, "kl": 0.04541015625, "learning_rate": 1.1625902876531083e-06, "loss": 0.0018, "reward": 1.4816043376922607, "reward_std": 0.16820675134658813, "rewards/accuracy_reward": 0.5011356472969055, "rewards/format_reward": 0.98046875, "step": 237 }, { "all_correct": 0.40625, "all_wrong": 0.09375, "completion_length": 92.08203125, "epoch": 0.4499054820415879, "grad_norm": 2.6028023149547193, "kl": 0.046630859375, "learning_rate": 1.156727739276897e-06, "loss": 0.0019, "reward": 1.7202575206756592, "reward_std": 0.14813324809074402, "rewards/accuracy_reward": 0.7241637706756592, "rewards/format_reward": 0.99609375, "step": 238 }, { "all_correct": 0.4375, "all_wrong": 0.28125, "completion_length": 99.9296875, "epoch": 0.45179584120982985, "grad_norm": 2.287302725917376, "kl": 0.048583984375, "learning_rate": 1.1508596633466853e-06, "loss": 0.0019, "reward": 1.5219180583953857, "reward_std": 0.0942273810505867, "rewards/accuracy_reward": 0.5492618083953857, "rewards/format_reward": 0.97265625, "step": 239 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 101.42578125, "epoch": 0.45368620037807184, "grad_norm": 3.1114222325682803, "kl": 0.04541015625, "learning_rate": 1.1449862668207732e-06, "loss": 0.0018, "reward": 1.4496355056762695, "reward_std": 0.19880539178848267, "rewards/accuracy_reward": 0.46916675567626953, "rewards/format_reward": 0.98046875, "step": 240 }, { "all_correct": 0.375, "all_wrong": 0.21875, "completion_length": 97.19140625, "epoch": 0.4555765595463138, "grad_norm": 1.8268906039007968, "kl": 0.042724609375, "learning_rate": 1.1391077568451115e-06, "loss": 0.0017, "reward": 1.6206369400024414, "reward_std": 0.1697978675365448, "rewards/accuracy_reward": 0.6284493803977966, "rewards/format_reward": 0.9921875, "step": 241 }, { "all_correct": 0.5625, "all_wrong": 0.125, "completion_length": 83.81640625, "epoch": 0.45746691871455575, "grad_norm": 3.936269420804981, "kl": 0.0439453125, "learning_rate": 1.1332243407459938e-06, "loss": 0.0018, "reward": 1.7336182594299316, "reward_std": 0.11902132630348206, "rewards/accuracy_reward": 0.7336182594299316, "rewards/format_reward": 1.0, "step": 242 }, { "all_correct": 0.4375, "all_wrong": 0.15625, "completion_length": 94.34765625, "epoch": 0.45935727788279773, "grad_norm": 2.638770141408573, "kl": 0.048095703125, "learning_rate": 1.1273362260227457e-06, "loss": 0.0019, "reward": 1.6561558246612549, "reward_std": 0.16947349905967712, "rewards/accuracy_reward": 0.6600620746612549, "rewards/format_reward": 0.99609375, "step": 243 }, { "all_correct": 0.46875, "all_wrong": 0.125, "completion_length": 90.83984375, "epoch": 0.4612476370510397, "grad_norm": 1.8631283133654752, "kl": 0.050048828125, "learning_rate": 1.121443620340406e-06, "loss": 0.002, "reward": 1.65234375, "reward_std": 0.19478288292884827, "rewards/accuracy_reward": 0.68359375, "rewards/format_reward": 0.96875, "step": 244 }, { "all_correct": 0.21875, "all_wrong": 0.28125, "completion_length": 97.4296875, "epoch": 0.46313799621928164, "grad_norm": 1.614309504872873, "kl": 0.044189453125, "learning_rate": 1.1155467315224037e-06, "loss": 0.0018, "reward": 1.4352020025253296, "reward_std": 0.1766517162322998, "rewards/accuracy_reward": 0.435202032327652, "rewards/format_reward": 1.0, "step": 245 }, { "all_correct": 0.34375, "all_wrong": 0.1875, "completion_length": 91.640625, "epoch": 0.46502835538752363, "grad_norm": 1.6379757636235799, "kl": 0.0478515625, "learning_rate": 1.1096457675432264e-06, "loss": 0.0019, "reward": 1.5429686307907104, "reward_std": 0.14759376645088196, "rewards/accuracy_reward": 0.5429686307907104, "rewards/format_reward": 1.0, "step": 246 }, { "all_correct": 0.34375, "all_wrong": 0.0625, "completion_length": 91.984375, "epoch": 0.4669187145557656, "grad_norm": 2.5291223401978296, "kl": 0.044921875, "learning_rate": 1.1037409365210879e-06, "loss": 0.0018, "reward": 1.6242541074752808, "reward_std": 0.2227800339460373, "rewards/accuracy_reward": 0.6281603574752808, "rewards/format_reward": 0.99609375, "step": 247 }, { "all_correct": 0.40625, "all_wrong": 0.1875, "completion_length": 95.96875, "epoch": 0.46880907372400754, "grad_norm": 1.5228770177382556, "kl": 0.046630859375, "learning_rate": 1.0978324467105857e-06, "loss": 0.0019, "reward": 1.575097680091858, "reward_std": 0.1790701001882553, "rewards/accuracy_reward": 0.5790039300918579, "rewards/format_reward": 0.99609375, "step": 248 }, { "all_correct": 0.28125, "all_wrong": 0.21875, "completion_length": 91.23828125, "epoch": 0.4706994328922495, "grad_norm": 2.489169729265948, "kl": 0.053466796875, "learning_rate": 1.0919205064953581e-06, "loss": 0.0021, "reward": 1.5097450017929077, "reward_std": 0.20132069289684296, "rewards/accuracy_reward": 0.5097450017929077, "rewards/format_reward": 1.0, "step": 249 }, { "all_correct": 0.34375, "all_wrong": 0.28125, "completion_length": 85.8046875, "epoch": 0.4725897920604915, "grad_norm": 2.586228799524351, "kl": 0.0498046875, "learning_rate": 1.0860053243807336e-06, "loss": 0.002, "reward": 1.514784574508667, "reward_std": 0.14834506809711456, "rewards/accuracy_reward": 0.514784574508667, "rewards/format_reward": 1.0, "step": 250 }, { "all_correct": 0.28125, "all_wrong": 0.15625, "completion_length": 95.62109375, "epoch": 0.47448015122873344, "grad_norm": 1.713585099464109, "kl": 0.042236328125, "learning_rate": 1.0800871089863784e-06, "loss": 0.0017, "reward": 1.586524248123169, "reward_std": 0.20916607975959778, "rewards/accuracy_reward": 0.586524248123169, "rewards/format_reward": 1.0, "step": 251 }, { "all_correct": 0.4375, "all_wrong": 0.21875, "completion_length": 84.66015625, "epoch": 0.4763705103969754, "grad_norm": 2.02730635206175, "kl": 0.052490234375, "learning_rate": 1.0741660690389365e-06, "loss": 0.0021, "reward": 1.6193575859069824, "reward_std": 0.1369372010231018, "rewards/accuracy_reward": 0.6193576455116272, "rewards/format_reward": 1.0, "step": 252 }, { "all_correct": 0.34375, "all_wrong": 0.25, "completion_length": 94.953125, "epoch": 0.4782608695652174, "grad_norm": 2.0450185647249373, "kl": 0.04541015625, "learning_rate": 1.068242413364671e-06, "loss": 0.0018, "reward": 1.5561109781265259, "reward_std": 0.18579518795013428, "rewards/accuracy_reward": 0.5561109781265259, "rewards/format_reward": 1.0, "step": 253 }, { "all_correct": 0.5, "all_wrong": 0.21875, "completion_length": 86.5078125, "epoch": 0.48015122873345933, "grad_norm": 3.4092659487344292, "kl": 0.050048828125, "learning_rate": 1.0623163508820976e-06, "loss": 0.002, "reward": 1.5766924619674683, "reward_std": 0.10343727469444275, "rewards/accuracy_reward": 0.5766924619674683, "rewards/format_reward": 1.0, "step": 254 }, { "all_correct": 0.46875, "all_wrong": 0.15625, "completion_length": 89.734375, "epoch": 0.4820415879017013, "grad_norm": 1.4529133237212937, "kl": 0.0517578125, "learning_rate": 1.0563880905946158e-06, "loss": 0.0021, "reward": 1.6530107259750366, "reward_std": 0.17242193222045898, "rewards/accuracy_reward": 0.6530107259750366, "rewards/format_reward": 1.0, "step": 255 }, { "all_correct": 0.4375, "all_wrong": 0.09375, "completion_length": 85.3359375, "epoch": 0.4839319470699433, "grad_norm": 1.6167911610708365, "kl": 0.0478515625, "learning_rate": 1.0504578415831394e-06, "loss": 0.0019, "reward": 1.7061023712158203, "reward_std": 0.1580139398574829, "rewards/accuracy_reward": 0.7061024904251099, "rewards/format_reward": 1.0, "step": 256 }, { "all_correct": 0.40625, "all_wrong": 0.1875, "completion_length": 89.16015625, "epoch": 0.48582230623818523, "grad_norm": 3.1031477511238963, "kl": 0.051513671875, "learning_rate": 1.0445258129987204e-06, "loss": 0.0021, "reward": 1.5749967098236084, "reward_std": 0.12671510875225067, "rewards/accuracy_reward": 0.5749967098236084, "rewards/format_reward": 1.0, "step": 257 }, { "all_correct": 0.3125, "all_wrong": 0.21875, "completion_length": 97.05078125, "epoch": 0.4877126654064272, "grad_norm": 1.587621420217378, "kl": 0.0478515625, "learning_rate": 1.0385922140551751e-06, "loss": 0.0019, "reward": 1.5610603094100952, "reward_std": 0.15413255989551544, "rewards/accuracy_reward": 0.5610603094100952, "rewards/format_reward": 1.0, "step": 258 }, { "all_correct": 0.40625, "all_wrong": 0.1875, "completion_length": 92.26171875, "epoch": 0.4896030245746692, "grad_norm": 1.4717842344274057, "kl": 0.05078125, "learning_rate": 1.0326572540217027e-06, "loss": 0.002, "reward": 1.5245153903961182, "reward_std": 0.1461203396320343, "rewards/accuracy_reward": 0.5245153307914734, "rewards/format_reward": 1.0, "step": 259 }, { "all_correct": 0.59375, "all_wrong": 0.1875, "completion_length": 94.67578125, "epoch": 0.4914933837429111, "grad_norm": 1.2872025757031114, "kl": 0.048828125, "learning_rate": 1.026721142215507e-06, "loss": 0.002, "reward": 1.6810582876205444, "reward_std": 0.0886523649096489, "rewards/accuracy_reward": 0.6810582876205444, "rewards/format_reward": 1.0, "step": 260 }, { "all_correct": 0.375, "all_wrong": 0.1875, "completion_length": 88.6640625, "epoch": 0.4933837429111531, "grad_norm": 2.8413160569475533, "kl": 0.04931640625, "learning_rate": 1.0207840879944122e-06, "loss": 0.002, "reward": 1.634856104850769, "reward_std": 0.16094039380550385, "rewards/accuracy_reward": 0.634856104850769, "rewards/format_reward": 1.0, "step": 261 }, { "all_correct": 0.34375, "all_wrong": 0.3125, "completion_length": 92.28515625, "epoch": 0.4952741020793951, "grad_norm": 1.3820672801691911, "kl": 0.046875, "learning_rate": 1.014846300749481e-06, "loss": 0.0019, "reward": 1.5555245876312256, "reward_std": 0.14345505833625793, "rewards/accuracy_reward": 0.5555245876312256, "rewards/format_reward": 1.0, "step": 262 }, { "all_correct": 0.375, "all_wrong": 0.09375, "completion_length": 90.9765625, "epoch": 0.497164461247637, "grad_norm": 2.081869839864706, "kl": 0.051025390625, "learning_rate": 1.0089079898976282e-06, "loss": 0.002, "reward": 1.6466023921966553, "reward_std": 0.1819521188735962, "rewards/accuracy_reward": 0.6466023921966553, "rewards/format_reward": 1.0, "step": 263 }, { "all_correct": 0.28125, "all_wrong": 0.21875, "completion_length": 94.72265625, "epoch": 0.499054820415879, "grad_norm": 2.580070748827961, "kl": 0.044677734375, "learning_rate": 1.0029693648742354e-06, "loss": 0.0018, "reward": 1.5194728374481201, "reward_std": 0.21391144394874573, "rewards/accuracy_reward": 0.5272853970527649, "rewards/format_reward": 0.9921875, "step": 264 }, { "all_correct": 0.3125, "all_wrong": 0.125, "completion_length": 89.3515625, "epoch": 0.500945179584121, "grad_norm": 2.2976847333751, "kl": 0.05224609375, "learning_rate": 9.970306351257645e-07, "loss": 0.0021, "reward": 1.6085888147354126, "reward_std": 0.23032766580581665, "rewards/accuracy_reward": 0.6085888147354126, "rewards/format_reward": 1.0, "step": 265 }, { "all_correct": 0.28125, "all_wrong": 0.25, "completion_length": 88.63671875, "epoch": 0.502835538752363, "grad_norm": 1.5748351110633412, "kl": 0.0576171875, "learning_rate": 9.910920101023717e-07, "loss": 0.0023, "reward": 1.4456298351287842, "reward_std": 0.1709638237953186, "rewards/accuracy_reward": 0.4456298351287842, "rewards/format_reward": 1.0, "step": 266 }, { "all_correct": 0.34375, "all_wrong": 0.125, "completion_length": 91.58984375, "epoch": 0.504725897920605, "grad_norm": 2.770591512834709, "kl": 0.0517578125, "learning_rate": 9.851536992505187e-07, "loss": 0.0021, "reward": 1.653957724571228, "reward_std": 0.20085959136486053, "rewards/accuracy_reward": 0.653957724571228, "rewards/format_reward": 1.0, "step": 267 }, { "all_correct": 0.34375, "all_wrong": 0.15625, "completion_length": 91.51953125, "epoch": 0.5066162570888468, "grad_norm": 1.6226350288665303, "kl": 0.047119140625, "learning_rate": 9.792159120055879e-07, "loss": 0.0019, "reward": 1.5789850950241089, "reward_std": 0.17338192462921143, "rewards/accuracy_reward": 0.5789849758148193, "rewards/format_reward": 1.0, "step": 268 }, { "all_correct": 0.28125, "all_wrong": 0.34375, "completion_length": 93.02734375, "epoch": 0.5085066162570888, "grad_norm": 1.2047460673900767, "kl": 0.051513671875, "learning_rate": 9.732788577844932e-07, "loss": 0.0021, "reward": 1.5021183490753174, "reward_std": 0.14113232493400574, "rewards/accuracy_reward": 0.5021182894706726, "rewards/format_reward": 1.0, "step": 269 }, { "all_correct": 0.375, "all_wrong": 0.15625, "completion_length": 86.44140625, "epoch": 0.5103969754253308, "grad_norm": 3.9170636810231727, "kl": 0.050537109375, "learning_rate": 9.673427459782974e-07, "loss": 0.002, "reward": 1.5727362632751465, "reward_std": 0.18539920449256897, "rewards/accuracy_reward": 0.5727362036705017, "rewards/format_reward": 1.0, "step": 270 }, { "all_correct": 0.4375, "all_wrong": 0.1875, "completion_length": 87.015625, "epoch": 0.5122873345935728, "grad_norm": 1.366883049982446, "kl": 0.05078125, "learning_rate": 9.61407785944825e-07, "loss": 0.002, "reward": 1.6010971069335938, "reward_std": 0.1428610235452652, "rewards/accuracy_reward": 0.6050034761428833, "rewards/format_reward": 0.99609375, "step": 271 }, { "all_correct": 0.25, "all_wrong": 0.28125, "completion_length": 90.48046875, "epoch": 0.5141776937618148, "grad_norm": 3.3253184693080673, "kl": 0.053955078125, "learning_rate": 9.554741870012795e-07, "loss": 0.0022, "reward": 1.478670358657837, "reward_std": 0.16775619983673096, "rewards/accuracy_reward": 0.4786703884601593, "rewards/format_reward": 1.0, "step": 272 }, { "all_correct": 0.375, "all_wrong": 0.21875, "completion_length": 86.51171875, "epoch": 0.5160680529300568, "grad_norm": 1.549632672045942, "kl": 0.048095703125, "learning_rate": 9.495421584168608e-07, "loss": 0.0019, "reward": 1.594543695449829, "reward_std": 0.18858283758163452, "rewards/accuracy_reward": 0.5984500050544739, "rewards/format_reward": 0.99609375, "step": 273 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 84.9453125, "epoch": 0.5179584120982986, "grad_norm": 2.314889573705669, "kl": 0.050048828125, "learning_rate": 9.436119094053845e-07, "loss": 0.002, "reward": 1.5329444408416748, "reward_std": 0.19728565216064453, "rewards/accuracy_reward": 0.5329445004463196, "rewards/format_reward": 1.0, "step": 274 }, { "all_correct": 0.375, "all_wrong": 0.09375, "completion_length": 91.96484375, "epoch": 0.5198487712665406, "grad_norm": 3.0057549499090004, "kl": 0.046142578125, "learning_rate": 9.376836491179027e-07, "loss": 0.0018, "reward": 1.6376956701278687, "reward_std": 0.187890887260437, "rewards/accuracy_reward": 0.6376956701278687, "rewards/format_reward": 1.0, "step": 275 }, { "all_correct": 0.34375, "all_wrong": 0.25, "completion_length": 83.40234375, "epoch": 0.5217391304347826, "grad_norm": 2.381324526997287, "kl": 0.04931640625, "learning_rate": 9.317575866353291e-07, "loss": 0.002, "reward": 1.5451953411102295, "reward_std": 0.15440954267978668, "rewards/accuracy_reward": 0.5451953411102295, "rewards/format_reward": 1.0, "step": 276 }, { "all_correct": 0.25, "all_wrong": 0.1875, "completion_length": 93.62109375, "epoch": 0.5236294896030246, "grad_norm": 1.6038678148688907, "kl": 0.046630859375, "learning_rate": 9.258339309610636e-07, "loss": 0.0019, "reward": 1.5747730731964111, "reward_std": 0.24761907756328583, "rewards/accuracy_reward": 0.5786792635917664, "rewards/format_reward": 0.99609375, "step": 277 }, { "all_correct": 0.21875, "all_wrong": 0.21875, "completion_length": 92.5625, "epoch": 0.5255198487712666, "grad_norm": 2.367357867898568, "kl": 0.049072265625, "learning_rate": 9.199128910136218e-07, "loss": 0.002, "reward": 1.4614596366882324, "reward_std": 0.24768300354480743, "rewards/accuracy_reward": 0.4653658866882324, "rewards/format_reward": 0.99609375, "step": 278 }, { "all_correct": 0.375, "all_wrong": 0.15625, "completion_length": 89.54296875, "epoch": 0.5274102079395085, "grad_norm": 2.2126449160143147, "kl": 0.043701171875, "learning_rate": 9.139946756192662e-07, "loss": 0.0018, "reward": 1.5789459943771362, "reward_std": 0.17482253909111023, "rewards/accuracy_reward": 0.5789459943771362, "rewards/format_reward": 1.0, "step": 279 }, { "all_correct": 0.25, "all_wrong": 0.21875, "completion_length": 91.51953125, "epoch": 0.5293005671077504, "grad_norm": 1.7672859098305684, "kl": 0.051025390625, "learning_rate": 9.08079493504642e-07, "loss": 0.002, "reward": 1.5148652791976929, "reward_std": 0.21231761574745178, "rewards/accuracy_reward": 0.5148652791976929, "rewards/format_reward": 1.0, "step": 280 }, { "all_correct": 0.46875, "all_wrong": 0.125, "completion_length": 89.3828125, "epoch": 0.5311909262759924, "grad_norm": 1.4247234430692624, "kl": 0.046875, "learning_rate": 9.021675532894144e-07, "loss": 0.0019, "reward": 1.6473538875579834, "reward_std": 0.14679107069969177, "rewards/accuracy_reward": 0.6512601971626282, "rewards/format_reward": 0.99609375, "step": 281 }, { "all_correct": 0.375, "all_wrong": 0.1875, "completion_length": 85.79296875, "epoch": 0.5330812854442344, "grad_norm": 1.3257824153478301, "kl": 0.048828125, "learning_rate": 8.962590634789123e-07, "loss": 0.002, "reward": 1.6150450706481934, "reward_std": 0.1819513887166977, "rewards/accuracy_reward": 0.6150450706481934, "rewards/format_reward": 1.0, "step": 282 }, { "all_correct": 0.3125, "all_wrong": 0.1875, "completion_length": 85.34375, "epoch": 0.5349716446124764, "grad_norm": 1.6854728753752273, "kl": 0.05078125, "learning_rate": 8.903542324567735e-07, "loss": 0.002, "reward": 1.5195235013961792, "reward_std": 0.20223002135753632, "rewards/accuracy_reward": 0.5390547513961792, "rewards/format_reward": 0.98046875, "step": 283 }, { "all_correct": 0.34375, "all_wrong": 0.15625, "completion_length": 91.28515625, "epoch": 0.5368620037807184, "grad_norm": 1.8405481449092091, "kl": 0.05419921875, "learning_rate": 8.844532684775963e-07, "loss": 0.0022, "reward": 1.592590093612671, "reward_std": 0.1902393102645874, "rewards/accuracy_reward": 0.5925900340080261, "rewards/format_reward": 1.0, "step": 284 }, { "all_correct": 0.34375, "all_wrong": 0.15625, "completion_length": 92.78125, "epoch": 0.5387523629489603, "grad_norm": 3.7950521224218687, "kl": 0.044189453125, "learning_rate": 8.785563796595938e-07, "loss": 0.0018, "reward": 1.6031997203826904, "reward_std": 0.19717274606227875, "rewards/accuracy_reward": 0.6031997203826904, "rewards/format_reward": 1.0, "step": 285 }, { "all_correct": 0.46875, "all_wrong": 0.25, "completion_length": 81.3046875, "epoch": 0.5406427221172023, "grad_norm": 2.3590193160665893, "kl": 0.061767578125, "learning_rate": 8.726637739772541e-07, "loss": 0.0025, "reward": 1.675480842590332, "reward_std": 0.10290536284446716, "rewards/accuracy_reward": 0.6754807829856873, "rewards/format_reward": 1.0, "step": 286 }, { "all_correct": 0.40625, "all_wrong": 0.28125, "completion_length": 84.125, "epoch": 0.5425330812854442, "grad_norm": 1.2126067200747002, "kl": 0.055908203125, "learning_rate": 8.667756592540063e-07, "loss": 0.0022, "reward": 1.5611882209777832, "reward_std": 0.11617802083492279, "rewards/accuracy_reward": 0.5611881017684937, "rewards/format_reward": 1.0, "step": 287 }, { "all_correct": 0.375, "all_wrong": 0.15625, "completion_length": 89.6796875, "epoch": 0.5444234404536862, "grad_norm": 1.966929179296431, "kl": 0.045166015625, "learning_rate": 8.608922431548887e-07, "loss": 0.0018, "reward": 1.6376736164093018, "reward_std": 0.18406596779823303, "rewards/accuracy_reward": 0.6376736760139465, "rewards/format_reward": 1.0, "step": 288 }, { "all_correct": 0.34375, "all_wrong": 0.125, "completion_length": 94.79296875, "epoch": 0.5463137996219282, "grad_norm": 1.756150509577632, "kl": 0.0400390625, "learning_rate": 8.550137331792269e-07, "loss": 0.0016, "reward": 1.6595051288604736, "reward_std": 0.24478332698345184, "rewards/accuracy_reward": 0.6673176884651184, "rewards/format_reward": 0.9921875, "step": 289 }, { "all_correct": 0.25, "all_wrong": 0.34375, "completion_length": 87.40625, "epoch": 0.5482041587901701, "grad_norm": 1.9775373962167517, "kl": 0.052978515625, "learning_rate": 8.49140336653315e-07, "loss": 0.0021, "reward": 1.4470252990722656, "reward_std": 0.17483514547348022, "rewards/accuracy_reward": 0.4470253586769104, "rewards/format_reward": 1.0, "step": 290 }, { "all_correct": 0.40625, "all_wrong": 0.125, "completion_length": 88.69921875, "epoch": 0.5500945179584121, "grad_norm": 2.063086280742146, "kl": 0.051025390625, "learning_rate": 8.432722607231029e-07, "loss": 0.002, "reward": 1.6172977685928345, "reward_std": 0.20396284759044647, "rewards/accuracy_reward": 0.6172977685928345, "rewards/format_reward": 1.0, "step": 291 }, { "all_correct": 0.28125, "all_wrong": 0.0625, "completion_length": 99.53515625, "epoch": 0.5519848771266541, "grad_norm": 2.621542277949528, "kl": 0.04150390625, "learning_rate": 8.374097123468917e-07, "loss": 0.0017, "reward": 1.5667483806610107, "reward_std": 0.24786125123500824, "rewards/accuracy_reward": 0.5706546306610107, "rewards/format_reward": 0.99609375, "step": 292 }, { "all_correct": 0.28125, "all_wrong": 0.15625, "completion_length": 86.796875, "epoch": 0.553875236294896, "grad_norm": 2.8165649295162316, "kl": 0.048095703125, "learning_rate": 8.315528982880337e-07, "loss": 0.0019, "reward": 1.5577614307403564, "reward_std": 0.25827598571777344, "rewards/accuracy_reward": 0.5655738711357117, "rewards/format_reward": 0.9921875, "step": 293 }, { "all_correct": 0.28125, "all_wrong": 0.1875, "completion_length": 87.5390625, "epoch": 0.555765595463138, "grad_norm": 2.0837535994437473, "kl": 0.0439453125, "learning_rate": 8.257020251076392e-07, "loss": 0.0018, "reward": 1.5403378009796143, "reward_std": 0.2276870459318161, "rewards/accuracy_reward": 0.5403377413749695, "rewards/format_reward": 1.0, "step": 294 }, { "all_correct": 0.34375, "all_wrong": 0.15625, "completion_length": 93.94921875, "epoch": 0.55765595463138, "grad_norm": 3.9566867940245594, "kl": 0.05029296875, "learning_rate": 8.198572991572939e-07, "loss": 0.002, "reward": 1.6211934089660645, "reward_std": 0.21309423446655273, "rewards/accuracy_reward": 0.6290059089660645, "rewards/format_reward": 0.9921875, "step": 295 }, { "all_correct": 0.21875, "all_wrong": 0.28125, "completion_length": 100.11328125, "epoch": 0.5595463137996219, "grad_norm": 1.6039871922459095, "kl": 0.046142578125, "learning_rate": 8.140189265717793e-07, "loss": 0.0018, "reward": 1.3850700855255127, "reward_std": 0.21388718485832214, "rewards/accuracy_reward": 0.42413264513015747, "rewards/format_reward": 0.9609375, "step": 296 }, { "all_correct": 0.34375, "all_wrong": 0.34375, "completion_length": 92.2734375, "epoch": 0.5614366729678639, "grad_norm": 1.4495763345559223, "kl": 0.05126953125, "learning_rate": 8.081871132618035e-07, "loss": 0.0021, "reward": 1.4881727695465088, "reward_std": 0.1367965191602707, "rewards/accuracy_reward": 0.4920789897441864, "rewards/format_reward": 0.99609375, "step": 297 }, { "all_correct": 0.40625, "all_wrong": 0.09375, "completion_length": 78.7109375, "epoch": 0.5633270321361059, "grad_norm": 2.195186142439845, "kl": 0.054443359375, "learning_rate": 8.023620649067383e-07, "loss": 0.0022, "reward": 1.6418862342834473, "reward_std": 0.17705166339874268, "rewards/accuracy_reward": 0.6418863534927368, "rewards/format_reward": 1.0, "step": 298 }, { "all_correct": 0.3125, "all_wrong": 0.25, "completion_length": 95.70703125, "epoch": 0.5652173913043478, "grad_norm": 1.5458362887848978, "kl": 0.044677734375, "learning_rate": 7.965439869473663e-07, "loss": 0.0018, "reward": 1.5705434083938599, "reward_std": 0.1792255938053131, "rewards/accuracy_reward": 0.5705434083938599, "rewards/format_reward": 1.0, "step": 299 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 91.9296875, "epoch": 0.5671077504725898, "grad_norm": 2.0503603995341266, "kl": 0.04541015625, "learning_rate": 7.907330845786337e-07, "loss": 0.0018, "reward": 1.5330439805984497, "reward_std": 0.1628941148519516, "rewards/accuracy_reward": 0.5564814805984497, "rewards/format_reward": 0.9765625, "step": 300 }, { "all_correct": 0.40625, "all_wrong": 0.15625, "completion_length": 86.5078125, "epoch": 0.5689981096408318, "grad_norm": 1.4513016708834336, "kl": 0.0478515625, "learning_rate": 7.849295627424147e-07, "loss": 0.0019, "reward": 1.6002389192581177, "reward_std": 0.16803400218486786, "rewards/accuracy_reward": 0.6158639192581177, "rewards/format_reward": 0.984375, "step": 301 }, { "all_correct": 0.40625, "all_wrong": 0.15625, "completion_length": 85.36328125, "epoch": 0.5708884688090737, "grad_norm": 2.0386539297024187, "kl": 0.050537109375, "learning_rate": 7.791336261202834e-07, "loss": 0.002, "reward": 1.6394249200820923, "reward_std": 0.18532907962799072, "rewards/accuracy_reward": 0.6550499200820923, "rewards/format_reward": 0.984375, "step": 302 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 91.37109375, "epoch": 0.5727788279773157, "grad_norm": 1.7416561634122243, "kl": 0.047119140625, "learning_rate": 7.733454791262945e-07, "loss": 0.0019, "reward": 1.5273735523223877, "reward_std": 0.1830909550189972, "rewards/accuracy_reward": 0.5273735523223877, "rewards/format_reward": 1.0, "step": 303 }, { "all_correct": 0.34375, "all_wrong": 0.1875, "completion_length": 93.0390625, "epoch": 0.5746691871455577, "grad_norm": 1.4257574938910071, "kl": 0.04931640625, "learning_rate": 7.67565325899774e-07, "loss": 0.002, "reward": 1.6209030151367188, "reward_std": 0.1880435049533844, "rewards/accuracy_reward": 0.6209030747413635, "rewards/format_reward": 1.0, "step": 304 }, { "all_correct": 0.375, "all_wrong": 0.09375, "completion_length": 94.640625, "epoch": 0.5765595463137996, "grad_norm": 2.523907663059568, "kl": 0.0478515625, "learning_rate": 7.617933702981197e-07, "loss": 0.0019, "reward": 1.646308183670044, "reward_std": 0.20141802728176117, "rewards/accuracy_reward": 0.650214433670044, "rewards/format_reward": 0.99609375, "step": 305 }, { "all_correct": 0.3125, "all_wrong": 0.28125, "completion_length": 88.98828125, "epoch": 0.5784499054820416, "grad_norm": 1.8869992601885726, "kl": 0.052978515625, "learning_rate": 7.560298158896114e-07, "loss": 0.0021, "reward": 1.5234375, "reward_std": 0.19467194378376007, "rewards/accuracy_reward": 0.52734375, "rewards/format_reward": 0.99609375, "step": 306 }, { "all_correct": 0.4375, "all_wrong": 0.125, "completion_length": 91.984375, "epoch": 0.5803402646502835, "grad_norm": 1.593500363249208, "kl": 0.050537109375, "learning_rate": 7.50274865946231e-07, "loss": 0.002, "reward": 1.6522129774093628, "reward_std": 0.15789154171943665, "rewards/accuracy_reward": 0.6522129774093628, "rewards/format_reward": 1.0, "step": 307 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 89.9140625, "epoch": 0.5822306238185255, "grad_norm": 1.5345766665959373, "kl": 0.046630859375, "learning_rate": 7.445287234364945e-07, "loss": 0.0019, "reward": 1.6170084476470947, "reward_std": 0.1939898431301117, "rewards/accuracy_reward": 0.61700838804245, "rewards/format_reward": 1.0, "step": 308 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 91.09375, "epoch": 0.5841209829867675, "grad_norm": 1.633714349616306, "kl": 0.04541015625, "learning_rate": 7.38791591018292e-07, "loss": 0.0018, "reward": 1.5211251974105835, "reward_std": 0.19408775866031647, "rewards/accuracy_reward": 0.5367502570152283, "rewards/format_reward": 0.984375, "step": 309 }, { "all_correct": 0.375, "all_wrong": 0.1875, "completion_length": 94.28515625, "epoch": 0.5860113421550095, "grad_norm": 1.7832966629730098, "kl": 0.044677734375, "learning_rate": 7.330636710317417e-07, "loss": 0.0018, "reward": 1.6072568893432617, "reward_std": 0.1647563874721527, "rewards/accuracy_reward": 0.6189756989479065, "rewards/format_reward": 0.98828125, "step": 310 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 89.41796875, "epoch": 0.5879017013232514, "grad_norm": 2.1849079934738294, "kl": 0.047607421875, "learning_rate": 7.27345165492053e-07, "loss": 0.0019, "reward": 1.5788297653198242, "reward_std": 0.18523138761520386, "rewards/accuracy_reward": 0.5788298845291138, "rewards/format_reward": 1.0, "step": 311 }, { "all_correct": 0.375, "all_wrong": 0.1875, "completion_length": 98.234375, "epoch": 0.5897920604914934, "grad_norm": 1.432954670732317, "kl": 0.044189453125, "learning_rate": 7.216362760824009e-07, "loss": 0.0018, "reward": 1.6121280193328857, "reward_std": 0.17768144607543945, "rewards/accuracy_reward": 0.612127959728241, "rewards/format_reward": 1.0, "step": 312 }, { "all_correct": 0.46875, "all_wrong": 0.125, "completion_length": 85.05078125, "epoch": 0.5916824196597353, "grad_norm": 1.7095977165647167, "kl": 0.0517578125, "learning_rate": 7.159372041468149e-07, "loss": 0.0021, "reward": 1.6863864660263062, "reward_std": 0.14531907439231873, "rewards/accuracy_reward": 0.6863864660263062, "rewards/format_reward": 1.0, "step": 313 }, { "all_correct": 0.25, "all_wrong": 0.21875, "completion_length": 89.55078125, "epoch": 0.5935727788279773, "grad_norm": 1.7588812930279742, "kl": 0.0439453125, "learning_rate": 7.102481506830763e-07, "loss": 0.0018, "reward": 1.4836997985839844, "reward_std": 0.21439874172210693, "rewards/accuracy_reward": 0.48369988799095154, "rewards/format_reward": 1.0, "step": 314 }, { "all_correct": 0.28125, "all_wrong": 0.28125, "completion_length": 93.41015625, "epoch": 0.5954631379962193, "grad_norm": 2.719325851548355, "kl": 0.04052734375, "learning_rate": 7.045693163356299e-07, "loss": 0.0016, "reward": 1.5147807598114014, "reward_std": 0.16246028244495392, "rewards/accuracy_reward": 0.5147807002067566, "rewards/format_reward": 1.0, "step": 315 }, { "all_correct": 0.21875, "all_wrong": 0.28125, "completion_length": 101.6171875, "epoch": 0.5973534971644613, "grad_norm": 12.827054251723668, "kl": 0.046875, "learning_rate": 6.989009013885076e-07, "loss": 0.0019, "reward": 1.442307710647583, "reward_std": 0.24046628177165985, "rewards/accuracy_reward": 0.457932710647583, "rewards/format_reward": 0.984375, "step": 316 }, { "all_correct": 0.46875, "all_wrong": 0.1875, "completion_length": 87.93359375, "epoch": 0.5992438563327032, "grad_norm": 1.6360722613389922, "kl": 0.055908203125, "learning_rate": 6.932431057582646e-07, "loss": 0.0022, "reward": 1.6484375, "reward_std": 0.16018126904964447, "rewards/accuracy_reward": 0.6484375, "rewards/format_reward": 1.0, "step": 317 }, { "all_correct": 0.3125, "all_wrong": 0.1875, "completion_length": 104.6484375, "epoch": 0.6011342155009451, "grad_norm": 1.8443996597658288, "kl": 0.04345703125, "learning_rate": 6.875961289869282e-07, "loss": 0.0017, "reward": 1.5236172676086426, "reward_std": 0.20754508674144745, "rewards/accuracy_reward": 0.5236173272132874, "rewards/format_reward": 1.0, "step": 318 }, { "all_correct": 0.4375, "all_wrong": 0.15625, "completion_length": 83.109375, "epoch": 0.6030245746691871, "grad_norm": 2.136504314601804, "kl": 0.051025390625, "learning_rate": 6.819601702349608e-07, "loss": 0.002, "reward": 1.6519629955291748, "reward_std": 0.15085574984550476, "rewards/accuracy_reward": 0.6519629955291748, "rewards/format_reward": 1.0, "step": 319 }, { "all_correct": 0.46875, "all_wrong": 0.0625, "completion_length": 93.57421875, "epoch": 0.6049149338374291, "grad_norm": 1.4937943209360285, "kl": 0.042236328125, "learning_rate": 6.763354282742362e-07, "loss": 0.0017, "reward": 1.628268837928772, "reward_std": 0.17435705661773682, "rewards/accuracy_reward": 0.636081337928772, "rewards/format_reward": 0.9921875, "step": 320 }, { "all_correct": 0.34375, "all_wrong": 0.15625, "completion_length": 90.71875, "epoch": 0.6068052930056711, "grad_norm": 8.501242318953185, "kl": 0.051513671875, "learning_rate": 6.707221014810278e-07, "loss": 0.0021, "reward": 1.5977280139923096, "reward_std": 0.18674521148204803, "rewards/accuracy_reward": 0.59772789478302, "rewards/format_reward": 1.0, "step": 321 }, { "all_correct": 0.53125, "all_wrong": 0.15625, "completion_length": 100.10546875, "epoch": 0.6086956521739131, "grad_norm": 1.126898780019968, "kl": 0.046875, "learning_rate": 6.651203878290138e-07, "loss": 0.0019, "reward": 1.6740057468414307, "reward_std": 0.13990236818790436, "rewards/accuracy_reward": 0.6740056872367859, "rewards/format_reward": 1.0, "step": 322 }, { "all_correct": 0.5, "all_wrong": 0.25, "completion_length": 87.5625, "epoch": 0.610586011342155, "grad_norm": 1.0829161392709314, "kl": 0.0537109375, "learning_rate": 6.59530484882294e-07, "loss": 0.0021, "reward": 1.6339197158813477, "reward_std": 0.10196228325366974, "rewards/accuracy_reward": 0.6339195966720581, "rewards/format_reward": 1.0, "step": 323 }, { "all_correct": 0.40625, "all_wrong": 0.21875, "completion_length": 85.04296875, "epoch": 0.6124763705103969, "grad_norm": 1.5071308132082326, "kl": 0.048095703125, "learning_rate": 6.539525897884218e-07, "loss": 0.0019, "reward": 1.601670503616333, "reward_std": 0.11899926513433456, "rewards/accuracy_reward": 0.6016704440116882, "rewards/format_reward": 1.0, "step": 324 }, { "all_correct": 0.28125, "all_wrong": 0.25, "completion_length": 105.0546875, "epoch": 0.6143667296786389, "grad_norm": 1.3796535779838177, "kl": 0.04638671875, "learning_rate": 6.48386899271452e-07, "loss": 0.0019, "reward": 1.5369006395339966, "reward_std": 0.1829143464565277, "rewards/accuracy_reward": 0.5486193895339966, "rewards/format_reward": 0.98828125, "step": 325 }, { "all_correct": 0.53125, "all_wrong": 0.21875, "completion_length": 91.96875, "epoch": 0.6162570888468809, "grad_norm": 1.0665567955819852, "kl": 0.043701171875, "learning_rate": 6.428336096250017e-07, "loss": 0.0018, "reward": 1.632015585899353, "reward_std": 0.08141334354877472, "rewards/accuracy_reward": 0.6320155262947083, "rewards/format_reward": 1.0, "step": 326 }, { "all_correct": 0.3125, "all_wrong": 0.15625, "completion_length": 98.70703125, "epoch": 0.6181474480151229, "grad_norm": 1.8190601838814684, "kl": 0.0498046875, "learning_rate": 6.372929167053285e-07, "loss": 0.002, "reward": 1.5398609638214111, "reward_std": 0.19976115226745605, "rewards/accuracy_reward": 0.5398609638214111, "rewards/format_reward": 1.0, "step": 327 }, { "all_correct": 0.46875, "all_wrong": 0.09375, "completion_length": 96.3984375, "epoch": 0.6200378071833649, "grad_norm": 10.48494019650004, "kl": 0.05029296875, "learning_rate": 6.317650159244212e-07, "loss": 0.002, "reward": 1.611169457435608, "reward_std": 0.17813417315483093, "rewards/accuracy_reward": 0.6150757074356079, "rewards/format_reward": 0.99609375, "step": 328 }, { "all_correct": 0.5, "all_wrong": 0.125, "completion_length": 90.19921875, "epoch": 0.6219281663516069, "grad_norm": 3.3256774582394724, "kl": 0.05029296875, "learning_rate": 6.262501022431099e-07, "loss": 0.002, "reward": 1.6589438915252686, "reward_std": 0.12664872407913208, "rewards/accuracy_reward": 0.6628501415252686, "rewards/format_reward": 0.99609375, "step": 329 }, { "all_correct": 0.40625, "all_wrong": 0.03125, "completion_length": 100.25390625, "epoch": 0.6238185255198487, "grad_norm": 1.4299757860972675, "kl": 0.04052734375, "learning_rate": 6.207483701641887e-07, "loss": 0.0016, "reward": 1.7447913885116577, "reward_std": 0.18475459516048431, "rewards/accuracy_reward": 0.7447913885116577, "rewards/format_reward": 1.0, "step": 330 }, { "all_correct": 0.34375, "all_wrong": 0.375, "completion_length": 91.765625, "epoch": 0.6257088846880907, "grad_norm": 1.1358781168676717, "kl": 0.0478515625, "learning_rate": 6.15260013725555e-07, "loss": 0.0019, "reward": 1.4526742696762085, "reward_std": 0.11981412023305893, "rewards/accuracy_reward": 0.4526742696762085, "rewards/format_reward": 1.0, "step": 331 }, { "all_correct": 0.40625, "all_wrong": 0.15625, "completion_length": 98.609375, "epoch": 0.6275992438563327, "grad_norm": 2.5447540141563048, "kl": 0.048828125, "learning_rate": 6.097852264933696e-07, "loss": 0.002, "reward": 1.619698166847229, "reward_std": 0.16045129299163818, "rewards/accuracy_reward": 0.6275107860565186, "rewards/format_reward": 0.9921875, "step": 332 }, { "all_correct": 0.3125, "all_wrong": 0.25, "completion_length": 91.7265625, "epoch": 0.6294896030245747, "grad_norm": 1.5705944493635686, "kl": 0.0478515625, "learning_rate": 6.043242015552256e-07, "loss": 0.0019, "reward": 1.52734375, "reward_std": 0.21317726373672485, "rewards/accuracy_reward": 0.52734375, "rewards/format_reward": 1.0, "step": 333 }, { "all_correct": 0.375, "all_wrong": 0.1875, "completion_length": 93.20703125, "epoch": 0.6313799621928167, "grad_norm": 1.4532227455775275, "kl": 0.039306640625, "learning_rate": 5.988771315133417e-07, "loss": 0.0016, "reward": 1.5797886848449707, "reward_std": 0.17210961878299713, "rewards/accuracy_reward": 0.5797887444496155, "rewards/format_reward": 1.0, "step": 334 }, { "all_correct": 0.40625, "all_wrong": 0.15625, "completion_length": 89.7421875, "epoch": 0.6332703213610587, "grad_norm": 1.5032464010247504, "kl": 0.04150390625, "learning_rate": 5.934442084777675e-07, "loss": 0.0017, "reward": 1.6588280200958252, "reward_std": 0.17925553023815155, "rewards/accuracy_reward": 0.6588280200958252, "rewards/format_reward": 1.0, "step": 335 }, { "all_correct": 0.375, "all_wrong": 0.21875, "completion_length": 89.48046875, "epoch": 0.6351606805293005, "grad_norm": 3.9562339582094346, "kl": 0.056640625, "learning_rate": 5.880256240596095e-07, "loss": 0.0023, "reward": 1.5489052534103394, "reward_std": 0.1494232714176178, "rewards/accuracy_reward": 0.5489052534103394, "rewards/format_reward": 1.0, "step": 336 }, { "all_correct": 0.40625, "all_wrong": 0.21875, "completion_length": 91.6171875, "epoch": 0.6370510396975425, "grad_norm": 4.504748528156339, "kl": 0.0537109375, "learning_rate": 5.826215693642709e-07, "loss": 0.0021, "reward": 1.574186086654663, "reward_std": 0.14129537343978882, "rewards/accuracy_reward": 0.5780923366546631, "rewards/format_reward": 0.99609375, "step": 337 }, { "all_correct": 0.53125, "all_wrong": 0.25, "completion_length": 96.40234375, "epoch": 0.6389413988657845, "grad_norm": 0.9918255765496156, "kl": 0.046875, "learning_rate": 5.772322349847153e-07, "loss": 0.0019, "reward": 1.65234375, "reward_std": 0.09954919666051865, "rewards/accuracy_reward": 0.65234375, "rewards/format_reward": 1.0, "step": 338 }, { "all_correct": 0.40625, "all_wrong": 0.21875, "completion_length": 90.94921875, "epoch": 0.6408317580340265, "grad_norm": 2.0285380237709387, "kl": 0.050537109375, "learning_rate": 5.718578109947409e-07, "loss": 0.002, "reward": 1.608215093612671, "reward_std": 0.10201030969619751, "rewards/accuracy_reward": 0.6082150340080261, "rewards/format_reward": 1.0, "step": 339 }, { "all_correct": 0.3125, "all_wrong": 0.21875, "completion_length": 87.05078125, "epoch": 0.6427221172022685, "grad_norm": 3.3176011557910097, "kl": 0.053466796875, "learning_rate": 5.664984869422802e-07, "loss": 0.0021, "reward": 1.5531736612319946, "reward_std": 0.17685286700725555, "rewards/accuracy_reward": 0.5531736612319946, "rewards/format_reward": 1.0, "step": 340 }, { "all_correct": 0.4375, "all_wrong": 0.09375, "completion_length": 100.0234375, "epoch": 0.6446124763705104, "grad_norm": 1.9481732665617166, "kl": 0.049072265625, "learning_rate": 5.611544518427121e-07, "loss": 0.002, "reward": 1.6415621042251587, "reward_std": 0.163002610206604, "rewards/accuracy_reward": 0.6493746042251587, "rewards/format_reward": 0.9921875, "step": 341 }, { "all_correct": 0.34375, "all_wrong": 0.21875, "completion_length": 91.18359375, "epoch": 0.6465028355387523, "grad_norm": 1.9147871985701481, "kl": 0.04931640625, "learning_rate": 5.558258941721981e-07, "loss": 0.002, "reward": 1.5601630210876465, "reward_std": 0.17341557145118713, "rewards/accuracy_reward": 0.5640692710876465, "rewards/format_reward": 0.99609375, "step": 342 }, { "all_correct": 0.40625, "all_wrong": 0.09375, "completion_length": 86.3515625, "epoch": 0.6483931947069943, "grad_norm": 2.2167075436448465, "kl": 0.05126953125, "learning_rate": 5.505130018610321e-07, "loss": 0.002, "reward": 1.6853443384170532, "reward_std": 0.20850321650505066, "rewards/accuracy_reward": 0.7009693384170532, "rewards/format_reward": 0.984375, "step": 343 }, { "all_correct": 0.5, "all_wrong": 0.1875, "completion_length": 83.95703125, "epoch": 0.6502835538752363, "grad_norm": 2.499264160160712, "kl": 0.055908203125, "learning_rate": 5.452159622870157e-07, "loss": 0.0022, "reward": 1.6708264350891113, "reward_std": 0.09797890484333038, "rewards/accuracy_reward": 0.6708264350891113, "rewards/format_reward": 1.0, "step": 344 }, { "all_correct": 0.375, "all_wrong": 0.21875, "completion_length": 98.9375, "epoch": 0.6521739130434783, "grad_norm": 1.1468871398817573, "kl": 0.051025390625, "learning_rate": 5.399349622688478e-07, "loss": 0.002, "reward": 1.5537773370742798, "reward_std": 0.17344093322753906, "rewards/accuracy_reward": 0.5772148370742798, "rewards/format_reward": 0.9765625, "step": 345 }, { "all_correct": 0.4375, "all_wrong": 0.1875, "completion_length": 91.84765625, "epoch": 0.6540642722117203, "grad_norm": 1.365593331621954, "kl": 0.0498046875, "learning_rate": 5.346701880595353e-07, "loss": 0.002, "reward": 1.6378886699676514, "reward_std": 0.14274653792381287, "rewards/accuracy_reward": 0.6378886699676514, "rewards/format_reward": 1.0, "step": 346 }, { "all_correct": 0.4375, "all_wrong": 0.25, "completion_length": 94.91015625, "epoch": 0.6559546313799622, "grad_norm": 1.5634515651271659, "kl": 0.045654296875, "learning_rate": 5.29421825339826e-07, "loss": 0.0018, "reward": 1.589550495147705, "reward_std": 0.13411790132522583, "rewards/accuracy_reward": 0.5973629951477051, "rewards/format_reward": 0.9921875, "step": 347 }, { "all_correct": 0.4375, "all_wrong": 0.25, "completion_length": 90.8359375, "epoch": 0.6578449905482041, "grad_norm": 1.1226897308498045, "kl": 0.0458984375, "learning_rate": 5.241900592116579e-07, "loss": 0.0018, "reward": 1.5573174953460693, "reward_std": 0.09218928962945938, "rewards/accuracy_reward": 0.5573174953460693, "rewards/format_reward": 1.0, "step": 348 }, { "all_correct": 0.375, "all_wrong": 0.1875, "completion_length": 94.4453125, "epoch": 0.6597353497164461, "grad_norm": 1.4290088468656539, "kl": 0.051513671875, "learning_rate": 5.189750741916326e-07, "loss": 0.0021, "reward": 1.623161792755127, "reward_std": 0.20275253057479858, "rewards/accuracy_reward": 0.642693042755127, "rewards/format_reward": 0.98046875, "step": 349 }, { "all_correct": 0.34375, "all_wrong": 0.03125, "completion_length": 96.1015625, "epoch": 0.6616257088846881, "grad_norm": 1.8681119609391044, "kl": 0.048583984375, "learning_rate": 5.137770542045062e-07, "loss": 0.0019, "reward": 1.688063144683838, "reward_std": 0.23905277252197266, "rewards/accuracy_reward": 0.6997818946838379, "rewards/format_reward": 0.98828125, "step": 350 }, { "all_correct": 0.40625, "all_wrong": 0.125, "completion_length": 84.9453125, "epoch": 0.6635160680529301, "grad_norm": 1.4728027178236136, "kl": 0.04833984375, "learning_rate": 5.085961825767049e-07, "loss": 0.0019, "reward": 1.674993872642517, "reward_std": 0.17204201221466064, "rewards/accuracy_reward": 0.6789001226425171, "rewards/format_reward": 0.99609375, "step": 351 }, { "all_correct": 0.3125, "all_wrong": 0.21875, "completion_length": 86.9140625, "epoch": 0.665406427221172, "grad_norm": 3.785173448605038, "kl": 0.04443359375, "learning_rate": 5.034326420298557e-07, "loss": 0.0018, "reward": 1.5950738191604614, "reward_std": 0.18303368985652924, "rewards/accuracy_reward": 0.5950738191604614, "rewards/format_reward": 1.0, "step": 352 }, { "all_correct": 0.28125, "all_wrong": 0.375, "completion_length": 93.01953125, "epoch": 0.667296786389414, "grad_norm": 1.4357110637131671, "kl": 0.0458984375, "learning_rate": 4.982866146743464e-07, "loss": 0.0018, "reward": 1.3835440874099731, "reward_std": 0.16043886542320251, "rewards/accuracy_reward": 0.40307533740997314, "rewards/format_reward": 0.98046875, "step": 353 }, { "all_correct": 0.40625, "all_wrong": 0.0625, "completion_length": 91.83203125, "epoch": 0.6691871455576559, "grad_norm": 2.0314351942520203, "kl": 0.037841796875, "learning_rate": 4.93158282002899e-07, "loss": 0.0015, "reward": 1.691582441329956, "reward_std": 0.21125006675720215, "rewards/accuracy_reward": 0.6915825605392456, "rewards/format_reward": 1.0, "step": 354 }, { "all_correct": 0.28125, "all_wrong": 0.3125, "completion_length": 96.66015625, "epoch": 0.6710775047258979, "grad_norm": 1.360692718067674, "kl": 0.046630859375, "learning_rate": 4.880478248841706e-07, "loss": 0.0019, "reward": 1.4369994401931763, "reward_std": 0.17168085277080536, "rewards/accuracy_reward": 0.44090569019317627, "rewards/format_reward": 0.99609375, "step": 355 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 85.65234375, "epoch": 0.6729678638941399, "grad_norm": 2.116858815599637, "kl": 0.047607421875, "learning_rate": 4.82955423556375e-07, "loss": 0.0019, "reward": 1.5795400142669678, "reward_std": 0.18496635556221008, "rewards/accuracy_reward": 0.5795398950576782, "rewards/format_reward": 1.0, "step": 356 }, { "all_correct": 0.375, "all_wrong": 0.28125, "completion_length": 83.1875, "epoch": 0.6748582230623819, "grad_norm": 1.7162608457835138, "kl": 0.04833984375, "learning_rate": 4.778812576209241e-07, "loss": 0.0019, "reward": 1.5707752704620361, "reward_std": 0.09887948632240295, "rewards/accuracy_reward": 0.5707752108573914, "rewards/format_reward": 1.0, "step": 357 }, { "all_correct": 0.25, "all_wrong": 0.15625, "completion_length": 88.57421875, "epoch": 0.6767485822306238, "grad_norm": 2.366206137579118, "kl": 0.0439453125, "learning_rate": 4.728255060360955e-07, "loss": 0.0018, "reward": 1.6372836828231812, "reward_std": 0.17435956001281738, "rewards/accuracy_reward": 0.6372836828231812, "rewards/format_reward": 1.0, "step": 358 }, { "all_correct": 0.34375, "all_wrong": 0.1875, "completion_length": 86.61328125, "epoch": 0.6786389413988658, "grad_norm": 1.692332788365531, "kl": 0.0478515625, "learning_rate": 4.6778834711071924e-07, "loss": 0.0019, "reward": 1.5796103477478027, "reward_std": 0.15928372740745544, "rewards/accuracy_reward": 0.5796103477478027, "rewards/format_reward": 1.0, "step": 359 }, { "all_correct": 0.4375, "all_wrong": 0.15625, "completion_length": 90.30859375, "epoch": 0.6805293005671077, "grad_norm": 1.2973562747312646, "kl": 0.05029296875, "learning_rate": 4.627699584978911e-07, "loss": 0.002, "reward": 1.6404190063476562, "reward_std": 0.14580589532852173, "rewards/accuracy_reward": 0.640419065952301, "rewards/format_reward": 1.0, "step": 360 }, { "all_correct": 0.375, "all_wrong": 0.1875, "completion_length": 91.734375, "epoch": 0.6824196597353497, "grad_norm": 1.924682020333693, "kl": 0.045166015625, "learning_rate": 4.57770517188705e-07, "loss": 0.0018, "reward": 1.5807889699935913, "reward_std": 0.18111172318458557, "rewards/accuracy_reward": 0.5846952199935913, "rewards/format_reward": 0.99609375, "step": 361 }, { "all_correct": 0.4375, "all_wrong": 0.15625, "completion_length": 92.046875, "epoch": 0.6843100189035917, "grad_norm": 1.4869820010117951, "kl": 0.044921875, "learning_rate": 4.527901995060113e-07, "loss": 0.0018, "reward": 1.6711153984069824, "reward_std": 0.16736207902431488, "rewards/accuracy_reward": 0.6789278984069824, "rewards/format_reward": 0.9921875, "step": 362 }, { "all_correct": 0.15625, "all_wrong": 0.25, "completion_length": 91.83984375, "epoch": 0.6862003780718336, "grad_norm": 1.7329104320683526, "kl": 0.04150390625, "learning_rate": 4.4782918109819976e-07, "loss": 0.0017, "reward": 1.4670283794403076, "reward_std": 0.24669389426708221, "rewards/accuracy_reward": 0.4670283794403076, "rewards/format_reward": 1.0, "step": 363 }, { "all_correct": 0.40625, "all_wrong": 0.25, "completion_length": 93.6328125, "epoch": 0.6880907372400756, "grad_norm": 1.5230757953398353, "kl": 0.052490234375, "learning_rate": 4.4288763693300226e-07, "loss": 0.0021, "reward": 1.6058498620986938, "reward_std": 0.14699707925319672, "rewards/accuracy_reward": 0.6175686120986938, "rewards/format_reward": 0.98828125, "step": 364 }, { "all_correct": 0.40625, "all_wrong": 0.21875, "completion_length": 96.92578125, "epoch": 0.6899810964083176, "grad_norm": 4.290738008536141, "kl": 0.044677734375, "learning_rate": 4.3796574129132423e-07, "loss": 0.0018, "reward": 1.560653805732727, "reward_std": 0.18818287551403046, "rewards/accuracy_reward": 0.580185055732727, "rewards/format_reward": 0.98046875, "step": 365 }, { "all_correct": 0.34375, "all_wrong": 0.1875, "completion_length": 91.5859375, "epoch": 0.6918714555765595, "grad_norm": 2.5809981268174518, "kl": 0.0478515625, "learning_rate": 4.3306366776109616e-07, "loss": 0.0019, "reward": 1.5660797357559204, "reward_std": 0.1301419585943222, "rewards/accuracy_reward": 0.5660797357559204, "rewards/format_reward": 1.0, "step": 366 }, { "all_correct": 0.46875, "all_wrong": 0.09375, "completion_length": 92.61328125, "epoch": 0.6937618147448015, "grad_norm": 2.395698693198084, "kl": 0.047119140625, "learning_rate": 4.2818158923115244e-07, "loss": 0.0019, "reward": 1.6860473155975342, "reward_std": 0.17321570217609406, "rewards/accuracy_reward": 0.6860473155975342, "rewards/format_reward": 1.0, "step": 367 }, { "all_correct": 0.375, "all_wrong": 0.1875, "completion_length": 93.4375, "epoch": 0.6956521739130435, "grad_norm": 2.175508616602608, "kl": 0.044677734375, "learning_rate": 4.233196778851329e-07, "loss": 0.0018, "reward": 1.5961754322052002, "reward_std": 0.15679945051670074, "rewards/accuracy_reward": 0.5961754322052002, "rewards/format_reward": 1.0, "step": 368 }, { "all_correct": 0.375, "all_wrong": 0.21875, "completion_length": 100.703125, "epoch": 0.6975425330812854, "grad_norm": 2.3623237744377685, "kl": 0.043212890625, "learning_rate": 4.184781051954125e-07, "loss": 0.0017, "reward": 1.6343427896499634, "reward_std": 0.1671934723854065, "rewards/accuracy_reward": 0.6343427896499634, "rewards/format_reward": 1.0, "step": 369 }, { "all_correct": 0.3125, "all_wrong": 0.09375, "completion_length": 97.34765625, "epoch": 0.6994328922495274, "grad_norm": 1.8342117768253567, "kl": 0.0419921875, "learning_rate": 4.136570419170501e-07, "loss": 0.0017, "reward": 1.5979249477386475, "reward_std": 0.2753554880619049, "rewards/accuracy_reward": 0.6135499477386475, "rewards/format_reward": 0.984375, "step": 370 }, { "all_correct": 0.3125, "all_wrong": 0.15625, "completion_length": 96.44140625, "epoch": 0.7013232514177694, "grad_norm": 1.8499863828085692, "kl": 0.04345703125, "learning_rate": 4.088566580817694e-07, "loss": 0.0017, "reward": 1.5595123767852783, "reward_std": 0.18973296880722046, "rewards/accuracy_reward": 0.5634186267852783, "rewards/format_reward": 0.99609375, "step": 371 }, { "all_correct": 0.46875, "all_wrong": 0.125, "completion_length": 99.48828125, "epoch": 0.7032136105860114, "grad_norm": 2.208899799066235, "kl": 0.04541015625, "learning_rate": 4.040771229919612e-07, "loss": 0.0018, "reward": 1.6975526809692383, "reward_std": 0.17384248971939087, "rewards/accuracy_reward": 0.7053651809692383, "rewards/format_reward": 0.9921875, "step": 372 }, { "all_correct": 0.28125, "all_wrong": 0.21875, "completion_length": 87.18359375, "epoch": 0.7051039697542533, "grad_norm": 1.8987142352577209, "kl": 0.04638671875, "learning_rate": 3.9931860521471097e-07, "loss": 0.0019, "reward": 1.5160590410232544, "reward_std": 0.21702983975410461, "rewards/accuracy_reward": 0.5160590410232544, "rewards/format_reward": 1.0, "step": 373 }, { "all_correct": 0.40625, "all_wrong": 0.09375, "completion_length": 92.609375, "epoch": 0.7069943289224953, "grad_norm": 15.04217885199878, "kl": 0.046142578125, "learning_rate": 3.945812725758554e-07, "loss": 0.0018, "reward": 1.7074790000915527, "reward_std": 0.19710449874401093, "rewards/accuracy_reward": 0.7113852500915527, "rewards/format_reward": 0.99609375, "step": 374 }, { "all_correct": 0.375, "all_wrong": 0.21875, "completion_length": 92.65625, "epoch": 0.7088846880907372, "grad_norm": 1.91152067931555, "kl": 0.043701171875, "learning_rate": 3.898652921540627e-07, "loss": 0.0017, "reward": 1.549987554550171, "reward_std": 0.15819934010505676, "rewards/accuracy_reward": 0.5617063641548157, "rewards/format_reward": 0.98828125, "step": 375 }, { "all_correct": 0.25, "all_wrong": 0.28125, "completion_length": 89.109375, "epoch": 0.7107750472589792, "grad_norm": 1.585271716791016, "kl": 0.052001953125, "learning_rate": 3.851708302749409e-07, "loss": 0.0021, "reward": 1.4718488454818726, "reward_std": 0.15529434382915497, "rewards/accuracy_reward": 0.47184884548187256, "rewards/format_reward": 1.0, "step": 376 }, { "all_correct": 0.3125, "all_wrong": 0.21875, "completion_length": 91.43359375, "epoch": 0.7126654064272212, "grad_norm": 1.6292096929321715, "kl": 0.039794921875, "learning_rate": 3.8049805250517e-07, "loss": 0.0016, "reward": 1.5054469108581543, "reward_std": 0.1916845291852951, "rewards/accuracy_reward": 0.5054467916488647, "rewards/format_reward": 1.0, "step": 377 }, { "all_correct": 0.28125, "all_wrong": 0.09375, "completion_length": 87.43359375, "epoch": 0.7145557655954632, "grad_norm": 2.5257432847150865, "kl": 0.045654296875, "learning_rate": 3.7584712364666493e-07, "loss": 0.0018, "reward": 1.5939102172851562, "reward_std": 0.24795284867286682, "rewards/accuracy_reward": 0.5939102172851562, "rewards/format_reward": 1.0, "step": 378 }, { "all_correct": 0.1875, "all_wrong": 0.09375, "completion_length": 94.90625, "epoch": 0.7164461247637051, "grad_norm": 2.7673062583960317, "kl": 0.045166015625, "learning_rate": 3.7121820773076097e-07, "loss": 0.0018, "reward": 1.5857834815979004, "reward_std": 0.25405406951904297, "rewards/accuracy_reward": 0.5857834219932556, "rewards/format_reward": 1.0, "step": 379 }, { "all_correct": 0.28125, "all_wrong": 0.09375, "completion_length": 90.87109375, "epoch": 0.718336483931947, "grad_norm": 2.2822899165231294, "kl": 0.047119140625, "learning_rate": 3.666114680124298e-07, "loss": 0.0019, "reward": 1.5186080932617188, "reward_std": 0.1982262283563614, "rewards/accuracy_reward": 0.5186082124710083, "rewards/format_reward": 1.0, "step": 380 }, { "all_correct": 0.21875, "all_wrong": 0.21875, "completion_length": 87.87890625, "epoch": 0.720226843100189, "grad_norm": 2.6361171968305346, "kl": 0.047607421875, "learning_rate": 3.620270669645228e-07, "loss": 0.0019, "reward": 1.43359375, "reward_std": 0.2574925422668457, "rewards/accuracy_reward": 0.43359375, "rewards/format_reward": 1.0, "step": 381 }, { "all_correct": 0.25, "all_wrong": 0.1875, "completion_length": 85.53515625, "epoch": 0.722117202268431, "grad_norm": 1.934475962789802, "kl": 0.04736328125, "learning_rate": 3.5746516627203816e-07, "loss": 0.0019, "reward": 1.5397059917449951, "reward_std": 0.2019021213054657, "rewards/accuracy_reward": 0.5397060513496399, "rewards/format_reward": 1.0, "step": 382 }, { "all_correct": 0.34375, "all_wrong": 0.15625, "completion_length": 89.05859375, "epoch": 0.724007561436673, "grad_norm": 1.7765413930033112, "kl": 0.04248046875, "learning_rate": 3.529259268264213e-07, "loss": 0.0017, "reward": 1.6433665752410889, "reward_std": 0.1822570264339447, "rewards/accuracy_reward": 0.6433665752410889, "rewards/format_reward": 1.0, "step": 383 }, { "all_correct": 0.4375, "all_wrong": 0.21875, "completion_length": 90.70703125, "epoch": 0.725897920604915, "grad_norm": 3.235569329484398, "kl": 0.04296875, "learning_rate": 3.4840950871988806e-07, "loss": 0.0017, "reward": 1.6414120197296143, "reward_std": 0.1383558064699173, "rewards/accuracy_reward": 0.6414120197296143, "rewards/format_reward": 1.0, "step": 384 }, { "all_correct": 0.4375, "all_wrong": 0.15625, "completion_length": 84.59765625, "epoch": 0.7277882797731569, "grad_norm": 2.5415509345709726, "kl": 0.052978515625, "learning_rate": 3.4391607123978096e-07, "loss": 0.0021, "reward": 1.6038849353790283, "reward_std": 0.14968551695346832, "rewards/accuracy_reward": 0.6038850545883179, "rewards/format_reward": 1.0, "step": 385 }, { "all_correct": 0.28125, "all_wrong": 0.1875, "completion_length": 92.71484375, "epoch": 0.7296786389413988, "grad_norm": 1.9594609018418163, "kl": 0.04736328125, "learning_rate": 3.3944577286294886e-07, "loss": 0.0019, "reward": 1.5191731452941895, "reward_std": 0.21669438481330872, "rewards/accuracy_reward": 0.5308918952941895, "rewards/format_reward": 0.98828125, "step": 386 }, { "all_correct": 0.3125, "all_wrong": 0.21875, "completion_length": 90.5, "epoch": 0.7315689981096408, "grad_norm": 1.5282132767893477, "kl": 0.044189453125, "learning_rate": 3.3499877125015907e-07, "loss": 0.0018, "reward": 1.5483942031860352, "reward_std": 0.1809859573841095, "rewards/accuracy_reward": 0.5523004531860352, "rewards/format_reward": 0.99609375, "step": 387 }, { "all_correct": 0.34375, "all_wrong": 0.21875, "completion_length": 88.44921875, "epoch": 0.7334593572778828, "grad_norm": 1.511915251166267, "kl": 0.04443359375, "learning_rate": 3.305752232405377e-07, "loss": 0.0018, "reward": 1.5602011680603027, "reward_std": 0.1667328178882599, "rewards/accuracy_reward": 0.560201108455658, "rewards/format_reward": 1.0, "step": 388 }, { "all_correct": 0.28125, "all_wrong": 0.125, "completion_length": 87.83203125, "epoch": 0.7353497164461248, "grad_norm": 2.450602394222447, "kl": 0.0419921875, "learning_rate": 3.2617528484603574e-07, "loss": 0.0017, "reward": 1.6159805059432983, "reward_std": 0.22847887873649597, "rewards/accuracy_reward": 0.6159805059432983, "rewards/format_reward": 1.0, "step": 389 }, { "all_correct": 0.40625, "all_wrong": 0.125, "completion_length": 93.2890625, "epoch": 0.7372400756143668, "grad_norm": 1.5610935075912176, "kl": 0.043212890625, "learning_rate": 3.217991112459296e-07, "loss": 0.0017, "reward": 1.614638328552246, "reward_std": 0.19237719476222992, "rewards/accuracy_reward": 0.6146383285522461, "rewards/format_reward": 1.0, "step": 390 }, { "all_correct": 0.40625, "all_wrong": 0.09375, "completion_length": 89.69921875, "epoch": 0.7391304347826086, "grad_norm": 1.6750006945933873, "kl": 0.041259765625, "learning_rate": 3.174468567813461e-07, "loss": 0.0017, "reward": 1.703397274017334, "reward_std": 0.19982343912124634, "rewards/accuracy_reward": 0.703397274017334, "rewards/format_reward": 1.0, "step": 391 }, { "all_correct": 0.46875, "all_wrong": 0.1875, "completion_length": 88.98046875, "epoch": 0.7410207939508506, "grad_norm": 1.5050532217987163, "kl": 0.040771484375, "learning_rate": 3.131186749498195e-07, "loss": 0.0016, "reward": 1.6249730587005615, "reward_std": 0.1348705291748047, "rewards/accuracy_reward": 0.6249730587005615, "rewards/format_reward": 1.0, "step": 392 }, { "all_correct": 0.40625, "all_wrong": 0.1875, "completion_length": 86.80078125, "epoch": 0.7429111531190926, "grad_norm": 2.2056728949358497, "kl": 0.047607421875, "learning_rate": 3.0881471839987815e-07, "loss": 0.0019, "reward": 1.6306958198547363, "reward_std": 0.15101388096809387, "rewards/accuracy_reward": 0.6306958198547363, "rewards/format_reward": 1.0, "step": 393 }, { "all_correct": 0.3125, "all_wrong": 0.21875, "completion_length": 91.85546875, "epoch": 0.7448015122873346, "grad_norm": 2.1315080678559024, "kl": 0.044921875, "learning_rate": 3.0453513892566195e-07, "loss": 0.0018, "reward": 1.544640302658081, "reward_std": 0.19710536301136017, "rewards/accuracy_reward": 0.5446402430534363, "rewards/format_reward": 1.0, "step": 394 }, { "all_correct": 0.40625, "all_wrong": 0.1875, "completion_length": 85.7109375, "epoch": 0.7466918714555766, "grad_norm": 2.06732415423423, "kl": 0.04931640625, "learning_rate": 3.0028008746156587e-07, "loss": 0.002, "reward": 1.5586893558502197, "reward_std": 0.15845312178134918, "rewards/accuracy_reward": 0.5586893558502197, "rewards/format_reward": 1.0, "step": 395 }, { "all_correct": 0.5, "all_wrong": 0.09375, "completion_length": 93.92578125, "epoch": 0.7485822306238186, "grad_norm": 1.8815000905698593, "kl": 0.046630859375, "learning_rate": 2.9604971407692026e-07, "loss": 0.0019, "reward": 1.697596788406372, "reward_std": 0.13653597235679626, "rewards/accuracy_reward": 0.6975967288017273, "rewards/format_reward": 1.0, "step": 396 }, { "all_correct": 0.21875, "all_wrong": 0.375, "completion_length": 93.3359375, "epoch": 0.7504725897920604, "grad_norm": 1.5391602816018224, "kl": 0.041015625, "learning_rate": 2.918441679706949e-07, "loss": 0.0016, "reward": 1.386269211769104, "reward_std": 0.18272624909877777, "rewards/accuracy_reward": 0.394081711769104, "rewards/format_reward": 0.9921875, "step": 397 }, { "all_correct": 0.3125, "all_wrong": 0.28125, "completion_length": 90.81640625, "epoch": 0.7523629489603024, "grad_norm": 1.1599213261291745, "kl": 0.043701171875, "learning_rate": 2.876635974662389e-07, "loss": 0.0017, "reward": 1.473933458328247, "reward_std": 0.16899192333221436, "rewards/accuracy_reward": 0.47783973813056946, "rewards/format_reward": 0.99609375, "step": 398 }, { "all_correct": 0.28125, "all_wrong": 0.21875, "completion_length": 90.78515625, "epoch": 0.7542533081285444, "grad_norm": 1.8839648182094828, "kl": 0.041748046875, "learning_rate": 2.8350815000604976e-07, "loss": 0.0017, "reward": 1.55859375, "reward_std": 0.23474985361099243, "rewards/accuracy_reward": 0.5703125, "rewards/format_reward": 0.98828125, "step": 399 }, { "all_correct": 0.4375, "all_wrong": 0.1875, "completion_length": 93.2578125, "epoch": 0.7561436672967864, "grad_norm": 1.4293529083675727, "kl": 0.04541015625, "learning_rate": 2.7937797214657143e-07, "loss": 0.0018, "reward": 1.5872396230697632, "reward_std": 0.1798129379749298, "rewards/accuracy_reward": 0.6106771230697632, "rewards/format_reward": 0.9765625, "step": 400 }, { "all_correct": 0.28125, "all_wrong": 0.25, "completion_length": 86.91796875, "epoch": 0.7580340264650284, "grad_norm": 2.026608250956968, "kl": 0.04541015625, "learning_rate": 2.752732095530279e-07, "loss": 0.0018, "reward": 1.4875531196594238, "reward_std": 0.18699489533901215, "rewards/accuracy_reward": 0.48755308985710144, "rewards/format_reward": 1.0, "step": 401 }, { "all_correct": 0.53125, "all_wrong": 0.15625, "completion_length": 99.1640625, "epoch": 0.7599243856332704, "grad_norm": 1.5439354460633543, "kl": 0.04443359375, "learning_rate": 2.711940069942833e-07, "loss": 0.0018, "reward": 1.6947214603424072, "reward_std": 0.1390814185142517, "rewards/accuracy_reward": 0.7181590795516968, "rewards/format_reward": 0.9765625, "step": 402 }, { "all_correct": 0.375, "all_wrong": 0.09375, "completion_length": 96.125, "epoch": 0.7618147448015122, "grad_norm": 1.9024078180465291, "kl": 0.039794921875, "learning_rate": 2.671405083377386e-07, "loss": 0.0016, "reward": 1.6169142723083496, "reward_std": 0.19913235306739807, "rewards/accuracy_reward": 0.6169142723083496, "rewards/format_reward": 1.0, "step": 403 }, { "all_correct": 0.34375, "all_wrong": 0.09375, "completion_length": 94.4921875, "epoch": 0.7637051039697542, "grad_norm": 2.8962041644930108, "kl": 0.042724609375, "learning_rate": 2.6311285654425574e-07, "loss": 0.0017, "reward": 1.6525933742523193, "reward_std": 0.25046759843826294, "rewards/accuracy_reward": 0.6525933146476746, "rewards/format_reward": 1.0, "step": 404 }, { "all_correct": 0.3125, "all_wrong": 0.25, "completion_length": 87.58203125, "epoch": 0.7655954631379962, "grad_norm": 1.4168894497370028, "kl": 0.0419921875, "learning_rate": 2.59111193663116e-07, "loss": 0.0017, "reward": 1.505859375, "reward_std": 0.17382082343101501, "rewards/accuracy_reward": 0.505859375, "rewards/format_reward": 1.0, "step": 405 }, { "all_correct": 0.40625, "all_wrong": 0.15625, "completion_length": 90.953125, "epoch": 0.7674858223062382, "grad_norm": 2.400477941677905, "kl": 0.044677734375, "learning_rate": 2.5513566082701134e-07, "loss": 0.0018, "reward": 1.6569660902023315, "reward_std": 0.14563217759132385, "rewards/accuracy_reward": 0.6569661498069763, "rewards/format_reward": 1.0, "step": 406 }, { "all_correct": 0.40625, "all_wrong": 0.1875, "completion_length": 94.703125, "epoch": 0.7693761814744802, "grad_norm": 1.2607140346650068, "kl": 0.03515625, "learning_rate": 2.51186398247065e-07, "loss": 0.0014, "reward": 1.6192121505737305, "reward_std": 0.11734330654144287, "rewards/accuracy_reward": 0.6192121505737305, "rewards/format_reward": 1.0, "step": 407 }, { "all_correct": 0.28125, "all_wrong": 0.1875, "completion_length": 91.6015625, "epoch": 0.7712665406427222, "grad_norm": 3.7794955364804887, "kl": 0.044677734375, "learning_rate": 2.472635452078883e-07, "loss": 0.0018, "reward": 1.5780255794525146, "reward_std": 0.1755901575088501, "rewards/accuracy_reward": 0.5780255794525146, "rewards/format_reward": 1.0, "step": 408 }, { "all_correct": 0.40625, "all_wrong": 0.25, "completion_length": 89.140625, "epoch": 0.7731568998109641, "grad_norm": 1.6217685852043846, "kl": 0.0400390625, "learning_rate": 2.433672400626663e-07, "loss": 0.0016, "reward": 1.6167256832122803, "reward_std": 0.1386527717113495, "rewards/accuracy_reward": 0.616725742816925, "rewards/format_reward": 1.0, "step": 409 }, { "all_correct": 0.28125, "all_wrong": 0.21875, "completion_length": 88.8203125, "epoch": 0.775047258979206, "grad_norm": 2.028364703410531, "kl": 0.051025390625, "learning_rate": 2.3949762022828093e-07, "loss": 0.002, "reward": 1.5439236164093018, "reward_std": 0.22322307527065277, "rewards/accuracy_reward": 0.5439236164093018, "rewards/format_reward": 1.0, "step": 410 }, { "all_correct": 0.28125, "all_wrong": 0.1875, "completion_length": 92.33203125, "epoch": 0.776937618147448, "grad_norm": 2.2480672974329035, "kl": 0.04931640625, "learning_rate": 2.3565482218046073e-07, "loss": 0.002, "reward": 1.567735195159912, "reward_std": 0.2084602564573288, "rewards/accuracy_reward": 0.5677351355552673, "rewards/format_reward": 1.0, "step": 411 }, { "all_correct": 0.5625, "all_wrong": 0.0625, "completion_length": 94.765625, "epoch": 0.77882797731569, "grad_norm": 1.8397495365435537, "kl": 0.04248046875, "learning_rate": 2.3183898144897175e-07, "loss": 0.0017, "reward": 1.7432655096054077, "reward_std": 0.177871972322464, "rewards/accuracy_reward": 0.7432655096054077, "rewards/format_reward": 1.0, "step": 412 }, { "all_correct": 0.1875, "all_wrong": 0.25, "completion_length": 90.35546875, "epoch": 0.780718336483932, "grad_norm": 2.4627560536575497, "kl": 0.0458984375, "learning_rate": 2.2805023261283496e-07, "loss": 0.0018, "reward": 1.4680068492889404, "reward_std": 0.2125786542892456, "rewards/accuracy_reward": 0.46800681948661804, "rewards/format_reward": 1.0, "step": 413 }, { "all_correct": 0.15625, "all_wrong": 0.21875, "completion_length": 96.5859375, "epoch": 0.782608695652174, "grad_norm": 21.851694077321632, "kl": 0.042724609375, "learning_rate": 2.2428870929558007e-07, "loss": 0.0017, "reward": 1.4453372955322266, "reward_std": 0.26194411516189575, "rewards/accuracy_reward": 0.4453372359275818, "rewards/format_reward": 1.0, "step": 414 }, { "all_correct": 0.28125, "all_wrong": 0.3125, "completion_length": 95.6015625, "epoch": 0.7844990548204159, "grad_norm": 2.1610667049451417, "kl": 0.04833984375, "learning_rate": 2.205545441605342e-07, "loss": 0.0019, "reward": 1.4564586877822876, "reward_std": 0.1753809005022049, "rewards/accuracy_reward": 0.46427121758461, "rewards/format_reward": 0.9921875, "step": 415 }, { "all_correct": 0.34375, "all_wrong": 0.21875, "completion_length": 91.46875, "epoch": 0.7863894139886578, "grad_norm": 10.481690705107473, "kl": 0.04443359375, "learning_rate": 2.1684786890614127e-07, "loss": 0.0018, "reward": 1.5876367092132568, "reward_std": 0.17977751791477203, "rewards/accuracy_reward": 0.5876367092132568, "rewards/format_reward": 1.0, "step": 416 }, { "all_correct": 0.34375, "all_wrong": 0.125, "completion_length": 94.65625, "epoch": 0.7882797731568998, "grad_norm": 2.8247468950858647, "kl": 0.039306640625, "learning_rate": 2.1316881426131827e-07, "loss": 0.0016, "reward": 1.6735260486602783, "reward_std": 0.17629718780517578, "rewards/accuracy_reward": 0.6735259890556335, "rewards/format_reward": 1.0, "step": 417 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 95.9765625, "epoch": 0.7901701323251418, "grad_norm": 1.9238663712431303, "kl": 0.041259765625, "learning_rate": 2.0951750998084438e-07, "loss": 0.0016, "reward": 1.5885775089263916, "reward_std": 0.2954632043838501, "rewards/accuracy_reward": 0.6002963781356812, "rewards/format_reward": 0.98828125, "step": 418 }, { "all_correct": 0.28125, "all_wrong": 0.1875, "completion_length": 92.58203125, "epoch": 0.7920604914933838, "grad_norm": 2.082634838675884, "kl": 0.04296875, "learning_rate": 2.058940848407854e-07, "loss": 0.0017, "reward": 1.5185022354125977, "reward_std": 0.21989545226097107, "rewards/accuracy_reward": 0.5185022950172424, "rewards/format_reward": 1.0, "step": 419 }, { "all_correct": 0.34375, "all_wrong": 0.15625, "completion_length": 95.96484375, "epoch": 0.7939508506616257, "grad_norm": 1.4637336629082067, "kl": 0.038818359375, "learning_rate": 2.0229866663395023e-07, "loss": 0.0016, "reward": 1.5763494968414307, "reward_std": 0.22779253125190735, "rewards/accuracy_reward": 0.5919744372367859, "rewards/format_reward": 0.984375, "step": 420 }, { "all_correct": 0.4375, "all_wrong": 0.1875, "completion_length": 94.42578125, "epoch": 0.7958412098298677, "grad_norm": 1.3544480164552257, "kl": 0.0419921875, "learning_rate": 1.9873138216538609e-07, "loss": 0.0017, "reward": 1.6315104961395264, "reward_std": 0.15740279853343964, "rewards/accuracy_reward": 0.6354166865348816, "rewards/format_reward": 0.99609375, "step": 421 }, { "all_correct": 0.25, "all_wrong": 0.28125, "completion_length": 91.765625, "epoch": 0.7977315689981096, "grad_norm": 9.201439231037005, "kl": 0.042724609375, "learning_rate": 1.951923572479044e-07, "loss": 0.0017, "reward": 1.479612112045288, "reward_std": 0.19981667399406433, "rewards/accuracy_reward": 0.4796121120452881, "rewards/format_reward": 1.0, "step": 422 }, { "all_correct": 0.28125, "all_wrong": 0.1875, "completion_length": 94.84765625, "epoch": 0.7996219281663516, "grad_norm": 1.4237426067896413, "kl": 0.048583984375, "learning_rate": 1.916817166976441e-07, "loss": 0.0019, "reward": 1.5277478694915771, "reward_std": 0.2083957940340042, "rewards/accuracy_reward": 0.5355602502822876, "rewards/format_reward": 0.9921875, "step": 423 }, { "all_correct": 0.3125, "all_wrong": 0.1875, "completion_length": 91.10546875, "epoch": 0.8015122873345936, "grad_norm": 2.293182503990981, "kl": 0.041259765625, "learning_rate": 1.8819958432967076e-07, "loss": 0.0017, "reward": 1.558853268623352, "reward_std": 0.18910501897335052, "rewards/accuracy_reward": 0.562759518623352, "rewards/format_reward": 0.99609375, "step": 424 }, { "all_correct": 0.40625, "all_wrong": 0.09375, "completion_length": 95.41015625, "epoch": 0.8034026465028355, "grad_norm": 1.5366707245335558, "kl": 0.0419921875, "learning_rate": 1.847460829536075e-07, "loss": 0.0017, "reward": 1.6059564352035522, "reward_std": 0.2242327332496643, "rewards/accuracy_reward": 0.6215814352035522, "rewards/format_reward": 0.984375, "step": 425 }, { "all_correct": 0.375, "all_wrong": 0.21875, "completion_length": 90.1484375, "epoch": 0.8052930056710775, "grad_norm": 1.3312039879338635, "kl": 0.041748046875, "learning_rate": 1.813213343693064e-07, "loss": 0.0017, "reward": 1.5957437753677368, "reward_std": 0.16802389919757843, "rewards/accuracy_reward": 0.595743715763092, "rewards/format_reward": 1.0, "step": 426 }, { "all_correct": 0.34375, "all_wrong": 0.09375, "completion_length": 97.79296875, "epoch": 0.8071833648393195, "grad_norm": 2.9517888821154714, "kl": 0.04248046875, "learning_rate": 1.779254593625501e-07, "loss": 0.0017, "reward": 1.6021231412887573, "reward_std": 0.22704648971557617, "rewards/accuracy_reward": 0.6021231412887573, "rewards/format_reward": 1.0, "step": 427 }, { "all_correct": 0.375, "all_wrong": 0.25, "completion_length": 92.49609375, "epoch": 0.8090737240075614, "grad_norm": 1.741276036032744, "kl": 0.048095703125, "learning_rate": 1.745585777007943e-07, "loss": 0.0019, "reward": 1.5888817310333252, "reward_std": 0.09943927079439163, "rewards/accuracy_reward": 0.5888816714286804, "rewards/format_reward": 1.0, "step": 428 }, { "all_correct": 0.4375, "all_wrong": 0.09375, "completion_length": 95.453125, "epoch": 0.8109640831758034, "grad_norm": 1.5867733017615009, "kl": 0.03955078125, "learning_rate": 1.7122080812894146e-07, "loss": 0.0016, "reward": 1.6287798881530762, "reward_std": 0.19112670421600342, "rewards/accuracy_reward": 0.6287798881530762, "rewards/format_reward": 1.0, "step": 429 }, { "all_correct": 0.21875, "all_wrong": 0.15625, "completion_length": 97.41796875, "epoch": 0.8128544423440454, "grad_norm": 1.8869110513185203, "kl": 0.04296875, "learning_rate": 1.679122683651546e-07, "loss": 0.0017, "reward": 1.5692415237426758, "reward_std": 0.24745473265647888, "rewards/accuracy_reward": 0.569241464138031, "rewards/format_reward": 1.0, "step": 430 }, { "all_correct": 0.3125, "all_wrong": 0.15625, "completion_length": 96.97265625, "epoch": 0.8147448015122873, "grad_norm": 2.0763617955307274, "kl": 0.0478515625, "learning_rate": 1.6463307509670522e-07, "loss": 0.0019, "reward": 1.5514062643051147, "reward_std": 0.21106550097465515, "rewards/accuracy_reward": 0.5514062643051147, "rewards/format_reward": 1.0, "step": 431 }, { "all_correct": 0.34375, "all_wrong": 0.21875, "completion_length": 96.08984375, "epoch": 0.8166351606805293, "grad_norm": 4.650342860685201, "kl": 0.044189453125, "learning_rate": 1.6138334397585674e-07, "loss": 0.0018, "reward": 1.5383667945861816, "reward_std": 0.166859969496727, "rewards/accuracy_reward": 0.5500854849815369, "rewards/format_reward": 0.98828125, "step": 432 }, { "all_correct": 0.3125, "all_wrong": 0.25, "completion_length": 90.95703125, "epoch": 0.8185255198487713, "grad_norm": 1.7529564775308286, "kl": 0.04150390625, "learning_rate": 1.5816318961578756e-07, "loss": 0.0017, "reward": 1.4878764152526855, "reward_std": 0.18354278802871704, "rewards/accuracy_reward": 0.48787635564804077, "rewards/format_reward": 1.0, "step": 433 }, { "all_correct": 0.25, "all_wrong": 0.21875, "completion_length": 85.59375, "epoch": 0.8204158790170132, "grad_norm": 3.6359061398641677, "kl": 0.048095703125, "learning_rate": 1.5497272558654695e-07, "loss": 0.0019, "reward": 1.4656922817230225, "reward_std": 0.2116546779870987, "rewards/accuracy_reward": 0.46959853172302246, "rewards/format_reward": 0.99609375, "step": 434 }, { "all_correct": 0.40625, "all_wrong": 0.09375, "completion_length": 95.8515625, "epoch": 0.8223062381852552, "grad_norm": 4.974875276682775, "kl": 0.046142578125, "learning_rate": 1.5181206441105077e-07, "loss": 0.0018, "reward": 1.6275532245635986, "reward_std": 0.20301690697669983, "rewards/accuracy_reward": 0.6314594745635986, "rewards/format_reward": 0.99609375, "step": 435 }, { "all_correct": 0.4375, "all_wrong": 0.125, "completion_length": 90.0625, "epoch": 0.8241965973534972, "grad_norm": 2.449754366709296, "kl": 0.04296875, "learning_rate": 1.4868131756111223e-07, "loss": 0.0017, "reward": 1.5798760652542114, "reward_std": 0.2174547016620636, "rewards/accuracy_reward": 0.5994073152542114, "rewards/format_reward": 0.98046875, "step": 436 }, { "all_correct": 0.40625, "all_wrong": 0.125, "completion_length": 99.25390625, "epoch": 0.8260869565217391, "grad_norm": 1.4549018063192884, "kl": 0.047607421875, "learning_rate": 1.4558059545351142e-07, "loss": 0.0019, "reward": 1.5477688312530518, "reward_std": 0.2062843143939972, "rewards/accuracy_reward": 0.5673000812530518, "rewards/format_reward": 0.98046875, "step": 437 }, { "all_correct": 0.40625, "all_wrong": 0.1875, "completion_length": 91.74609375, "epoch": 0.8279773156899811, "grad_norm": 1.2525640014161112, "kl": 0.04248046875, "learning_rate": 1.425100074461003e-07, "loss": 0.0017, "reward": 1.6436383724212646, "reward_std": 0.1697710007429123, "rewards/accuracy_reward": 0.6436384320259094, "rewards/format_reward": 1.0, "step": 438 }, { "all_correct": 0.40625, "all_wrong": 0.09375, "completion_length": 93.125, "epoch": 0.8298676748582231, "grad_norm": 2.3121952823917042, "kl": 0.040283203125, "learning_rate": 1.394696618339456e-07, "loss": 0.0016, "reward": 1.6626487970352173, "reward_std": 0.174302339553833, "rewards/accuracy_reward": 0.6626487970352173, "rewards/format_reward": 1.0, "step": 439 }, { "all_correct": 0.3125, "all_wrong": 0.34375, "completion_length": 93.94140625, "epoch": 0.831758034026465, "grad_norm": 1.3605971037488336, "kl": 0.045166015625, "learning_rate": 1.364596658455105e-07, "loss": 0.0018, "reward": 1.5049912929534912, "reward_std": 0.12267406284809113, "rewards/accuracy_reward": 0.504991352558136, "rewards/format_reward": 1.0, "step": 440 }, { "all_correct": 0.40625, "all_wrong": 0.1875, "completion_length": 91.05859375, "epoch": 0.833648393194707, "grad_norm": 1.4329708155173375, "kl": 0.043701171875, "learning_rate": 1.33480125638871e-07, "loss": 0.0017, "reward": 1.6171175241470337, "reward_std": 0.1537449210882187, "rewards/accuracy_reward": 0.6210237741470337, "rewards/format_reward": 0.99609375, "step": 441 }, { "all_correct": 0.375, "all_wrong": 0.1875, "completion_length": 88.9765625, "epoch": 0.8355387523629489, "grad_norm": 17.688886780647458, "kl": 0.0400390625, "learning_rate": 1.3053114629797435e-07, "loss": 0.0016, "reward": 1.5753612518310547, "reward_std": 0.18671679496765137, "rewards/accuracy_reward": 0.5753612518310547, "rewards/format_reward": 1.0, "step": 442 }, { "all_correct": 0.375, "all_wrong": 0.15625, "completion_length": 98.7578125, "epoch": 0.8374291115311909, "grad_norm": 1.9625832093816682, "kl": 0.045654296875, "learning_rate": 1.2761283182893047e-07, "loss": 0.0018, "reward": 1.5934289693832397, "reward_std": 0.21815939247608185, "rewards/accuracy_reward": 0.5934289693832397, "rewards/format_reward": 1.0, "step": 443 }, { "all_correct": 0.40625, "all_wrong": 0.125, "completion_length": 96.1328125, "epoch": 0.8393194706994329, "grad_norm": 1.580613447705244, "kl": 0.039794921875, "learning_rate": 1.2472528515634585e-07, "loss": 0.0016, "reward": 1.6483333110809326, "reward_std": 0.19740188121795654, "rewards/accuracy_reward": 0.6483333110809326, "rewards/format_reward": 1.0, "step": 444 }, { "all_correct": 0.53125, "all_wrong": 0.1875, "completion_length": 94.78125, "epoch": 0.8412098298676749, "grad_norm": 1.1020475480569663, "kl": 0.03955078125, "learning_rate": 1.2186860811969168e-07, "loss": 0.0016, "reward": 1.702857494354248, "reward_std": 0.13044710457324982, "rewards/accuracy_reward": 0.7028576135635376, "rewards/format_reward": 1.0, "step": 445 }, { "all_correct": 0.375, "all_wrong": 0.125, "completion_length": 94.3515625, "epoch": 0.8431001890359168, "grad_norm": 3.1193628774228603, "kl": 0.048583984375, "learning_rate": 1.1904290146971397e-07, "loss": 0.0019, "reward": 1.6091580390930176, "reward_std": 0.2114795446395874, "rewards/accuracy_reward": 0.6130642294883728, "rewards/format_reward": 0.99609375, "step": 446 }, { "all_correct": 0.5, "all_wrong": 0.15625, "completion_length": 93.82421875, "epoch": 0.8449905482041588, "grad_norm": 1.5499881081679403, "kl": 0.046142578125, "learning_rate": 1.1624826486487872e-07, "loss": 0.0018, "reward": 1.68212890625, "reward_std": 0.1655098795890808, "rewards/accuracy_reward": 0.68994140625, "rewards/format_reward": 0.9921875, "step": 447 }, { "all_correct": 0.46875, "all_wrong": 0.1875, "completion_length": 96.47265625, "epoch": 0.8468809073724007, "grad_norm": 1.3637912127369223, "kl": 0.04248046875, "learning_rate": 1.134847968678575e-07, "loss": 0.0017, "reward": 1.6614583730697632, "reward_std": 0.15649890899658203, "rewards/accuracy_reward": 0.6809896230697632, "rewards/format_reward": 0.98046875, "step": 448 }, { "all_correct": 0.5, "all_wrong": 0.21875, "completion_length": 90.0703125, "epoch": 0.8487712665406427, "grad_norm": 1.1591726049731972, "kl": 0.0390625, "learning_rate": 1.1075259494205225e-07, "loss": 0.0016, "reward": 1.655552625656128, "reward_std": 0.12017819285392761, "rewards/accuracy_reward": 0.6594588756561279, "rewards/format_reward": 0.99609375, "step": 449 }, { "all_correct": 0.40625, "all_wrong": 0.15625, "completion_length": 92.24609375, "epoch": 0.8506616257088847, "grad_norm": 2.039439334103359, "kl": 0.0419921875, "learning_rate": 1.0805175544815648e-07, "loss": 0.0017, "reward": 1.6180661916732788, "reward_std": 0.16808518767356873, "rewards/accuracy_reward": 0.6180662512779236, "rewards/format_reward": 1.0, "step": 450 }, { "all_correct": 0.40625, "all_wrong": 0.09375, "completion_length": 87.765625, "epoch": 0.8525519848771267, "grad_norm": 3.551379163758167, "kl": 0.048095703125, "learning_rate": 1.0538237364075786e-07, "loss": 0.0019, "reward": 1.7047264575958252, "reward_std": 0.17995205521583557, "rewards/accuracy_reward": 0.7047264575958252, "rewards/format_reward": 1.0, "step": 451 }, { "all_correct": 0.3125, "all_wrong": 0.1875, "completion_length": 94.16796875, "epoch": 0.8544423440453687, "grad_norm": 2.4552903396429553, "kl": 0.04296875, "learning_rate": 1.0274454366497787e-07, "loss": 0.0017, "reward": 1.5992646217346191, "reward_std": 0.19959133863449097, "rewards/accuracy_reward": 0.5992645621299744, "rewards/format_reward": 1.0, "step": 452 }, { "all_correct": 0.34375, "all_wrong": 0.21875, "completion_length": 98.84375, "epoch": 0.8563327032136105, "grad_norm": 1.9864652083005676, "kl": 0.04296875, "learning_rate": 1.0013835855315233e-07, "loss": 0.0017, "reward": 1.547175407409668, "reward_std": 0.18141832947731018, "rewards/accuracy_reward": 0.5471754670143127, "rewards/format_reward": 1.0, "step": 453 }, { "all_correct": 0.375, "all_wrong": 0.15625, "completion_length": 91.21484375, "epoch": 0.8582230623818525, "grad_norm": 1.4554139396903383, "kl": 0.04345703125, "learning_rate": 9.756391022154953e-08, "loss": 0.0017, "reward": 1.6482672691345215, "reward_std": 0.20143428444862366, "rewards/accuracy_reward": 0.6482672095298767, "rewards/format_reward": 1.0, "step": 454 }, { "all_correct": 0.40625, "all_wrong": 0.21875, "completion_length": 91.6875, "epoch": 0.8601134215500945, "grad_norm": 1.3848840834631115, "kl": 0.05029296875, "learning_rate": 9.502128946712862e-08, "loss": 0.002, "reward": 1.5533459186553955, "reward_std": 0.1582801640033722, "rewards/accuracy_reward": 0.5572521090507507, "rewards/format_reward": 0.99609375, "step": 455 }, { "all_correct": 0.40625, "all_wrong": 0.28125, "completion_length": 87.30078125, "epoch": 0.8620037807183365, "grad_norm": 1.474494768169295, "kl": 0.0390625, "learning_rate": 9.251058596433792e-08, "loss": 0.0016, "reward": 1.566476821899414, "reward_std": 0.08095038682222366, "rewards/accuracy_reward": 0.5664768218994141, "rewards/format_reward": 1.0, "step": 456 }, { "all_correct": 0.40625, "all_wrong": 0.0625, "completion_length": 92.46484375, "epoch": 0.8638941398865785, "grad_norm": 2.228512668796117, "kl": 0.04150390625, "learning_rate": 9.003188826195141e-08, "loss": 0.0017, "reward": 1.6768805980682373, "reward_std": 0.21106885373592377, "rewards/accuracy_reward": 0.6768805384635925, "rewards/format_reward": 1.0, "step": 457 }, { "all_correct": 0.28125, "all_wrong": 0.25, "completion_length": 89.76953125, "epoch": 0.8657844990548205, "grad_norm": 2.381172656339536, "kl": 0.043701171875, "learning_rate": 8.758528377994667e-08, "loss": 0.0017, "reward": 1.5411908626556396, "reward_std": 0.182044118642807, "rewards/accuracy_reward": 0.5411908626556396, "rewards/format_reward": 1.0, "step": 458 }, { "all_correct": 0.46875, "all_wrong": 0.21875, "completion_length": 97.53125, "epoch": 0.8676748582230623, "grad_norm": 1.1370034333672576, "kl": 0.042724609375, "learning_rate": 8.51708588064206e-08, "loss": 0.0017, "reward": 1.5652220249176025, "reward_std": 0.13882781565189362, "rewards/accuracy_reward": 0.5730345249176025, "rewards/format_reward": 0.9921875, "step": 459 }, { "all_correct": 0.25, "all_wrong": 0.28125, "completion_length": 99.2421875, "epoch": 0.8695652173913043, "grad_norm": 1.9476181687483543, "kl": 0.041259765625, "learning_rate": 8.278869849454717e-08, "loss": 0.0017, "reward": 1.478024959564209, "reward_std": 0.17786462604999542, "rewards/accuracy_reward": 0.47802501916885376, "rewards/format_reward": 1.0, "step": 460 }, { "all_correct": 0.375, "all_wrong": 0.1875, "completion_length": 100.20703125, "epoch": 0.8714555765595463, "grad_norm": 1.224210562302336, "kl": 0.0380859375, "learning_rate": 8.043888685957312e-08, "loss": 0.0015, "reward": 1.5925004482269287, "reward_std": 0.14712047576904297, "rewards/accuracy_reward": 0.5925004482269287, "rewards/format_reward": 1.0, "step": 461 }, { "all_correct": 0.40625, "all_wrong": 0.15625, "completion_length": 91.64453125, "epoch": 0.8733459357277883, "grad_norm": 4.6192400629488395, "kl": 0.045654296875, "learning_rate": 7.812150677585671e-08, "loss": 0.0018, "reward": 1.5704599618911743, "reward_std": 0.1768045723438263, "rewards/accuracy_reward": 0.5743662118911743, "rewards/format_reward": 0.99609375, "step": 462 }, { "all_correct": 0.375, "all_wrong": 0.28125, "completion_length": 94.3828125, "epoch": 0.8752362948960303, "grad_norm": 1.9953801863403324, "kl": 0.049560546875, "learning_rate": 7.58366399739424e-08, "loss": 0.002, "reward": 1.5050649642944336, "reward_std": 0.11211474239826202, "rewards/accuracy_reward": 0.5089712142944336, "rewards/format_reward": 0.99609375, "step": 463 }, { "all_correct": 0.4375, "all_wrong": 0.125, "completion_length": 90.1640625, "epoch": 0.8771266540642723, "grad_norm": 1.8235799095393221, "kl": 0.043212890625, "learning_rate": 7.358436703768034e-08, "loss": 0.0017, "reward": 1.589560627937317, "reward_std": 0.16465333104133606, "rewards/accuracy_reward": 0.5895605683326721, "rewards/format_reward": 1.0, "step": 464 }, { "all_correct": 0.4375, "all_wrong": 0.21875, "completion_length": 95.63671875, "epoch": 0.8790170132325141, "grad_norm": 1.0760041562687386, "kl": 0.0380859375, "learning_rate": 7.136476740138387e-08, "loss": 0.0015, "reward": 1.5880682468414307, "reward_std": 0.15734529495239258, "rewards/accuracy_reward": 0.6154119968414307, "rewards/format_reward": 0.97265625, "step": 465 }, { "all_correct": 0.25, "all_wrong": 0.25, "completion_length": 101.12890625, "epoch": 0.8809073724007561, "grad_norm": 1.4738314342767895, "kl": 0.044189453125, "learning_rate": 6.917791934702655e-08, "loss": 0.0018, "reward": 1.4817919731140137, "reward_std": 0.2077905535697937, "rewards/accuracy_reward": 0.48179197311401367, "rewards/format_reward": 1.0, "step": 466 }, { "all_correct": 0.53125, "all_wrong": 0.15625, "completion_length": 95.58203125, "epoch": 0.8827977315689981, "grad_norm": 1.0571716079445075, "kl": 0.04296875, "learning_rate": 6.70239000014835e-08, "loss": 0.0017, "reward": 1.6927690505981445, "reward_std": 0.14029529690742493, "rewards/accuracy_reward": 0.6966753005981445, "rewards/format_reward": 0.99609375, "step": 467 }, { "all_correct": 0.3125, "all_wrong": 0.21875, "completion_length": 97.92578125, "epoch": 0.8846880907372401, "grad_norm": 1.5234291263564896, "kl": 0.0458984375, "learning_rate": 6.490278533380955e-08, "loss": 0.0018, "reward": 1.540100336074829, "reward_std": 0.2111871838569641, "rewards/accuracy_reward": 0.5635378360748291, "rewards/format_reward": 0.9765625, "step": 468 }, { "all_correct": 0.375, "all_wrong": 0.1875, "completion_length": 83.59375, "epoch": 0.8865784499054821, "grad_norm": 2.0255739847362766, "kl": 0.049560546875, "learning_rate": 6.281465015256093e-08, "loss": 0.002, "reward": 1.5604361295700073, "reward_std": 0.17061945796012878, "rewards/accuracy_reward": 0.5604361891746521, "rewards/format_reward": 1.0, "step": 469 }, { "all_correct": 0.40625, "all_wrong": 0.1875, "completion_length": 103.4921875, "epoch": 0.888468809073724, "grad_norm": 1.4503561746862432, "kl": 0.044921875, "learning_rate": 6.075956810315619e-08, "loss": 0.0018, "reward": 1.599052906036377, "reward_std": 0.1590302437543869, "rewards/accuracy_reward": 0.5990527868270874, "rewards/format_reward": 1.0, "step": 470 }, { "all_correct": 0.4375, "all_wrong": 0.15625, "completion_length": 85.69921875, "epoch": 0.8903591682419659, "grad_norm": 1.9843134997693552, "kl": 0.052001953125, "learning_rate": 5.8737611665279355e-08, "loss": 0.0021, "reward": 1.624929666519165, "reward_std": 0.15200699865818024, "rewards/accuracy_reward": 0.624929666519165, "rewards/format_reward": 1.0, "step": 471 }, { "all_correct": 0.25, "all_wrong": 0.125, "completion_length": 95.69921875, "epoch": 0.8922495274102079, "grad_norm": 1.9277786145166012, "kl": 0.0439453125, "learning_rate": 5.6748852150323215e-08, "loss": 0.0018, "reward": 1.5506601333618164, "reward_std": 0.21757060289382935, "rewards/accuracy_reward": 0.5506601333618164, "rewards/format_reward": 1.0, "step": 472 }, { "all_correct": 0.28125, "all_wrong": 0.25, "completion_length": 94.9140625, "epoch": 0.8941398865784499, "grad_norm": 1.3171715987687558, "kl": 0.04150390625, "learning_rate": 5.479335969887466e-08, "loss": 0.0017, "reward": 1.486616611480713, "reward_std": 0.17591437697410583, "rewards/accuracy_reward": 0.49052292108535767, "rewards/format_reward": 0.99609375, "step": 473 }, { "all_correct": 0.34375, "all_wrong": 0.34375, "completion_length": 92.6640625, "epoch": 0.8960302457466919, "grad_norm": 1.1864505239231775, "kl": 0.05126953125, "learning_rate": 5.2871203278240906e-08, "loss": 0.002, "reward": 1.528516173362732, "reward_std": 0.11142821609973907, "rewards/accuracy_reward": 0.5285161733627319, "rewards/format_reward": 1.0, "step": 474 }, { "all_correct": 0.28125, "all_wrong": 0.1875, "completion_length": 99.765625, "epoch": 0.8979206049149339, "grad_norm": 3.793748033276946, "kl": 0.0390625, "learning_rate": 5.098245068001661e-08, "loss": 0.0016, "reward": 1.4771013259887695, "reward_std": 0.22667381167411804, "rewards/accuracy_reward": 0.47710129618644714, "rewards/format_reward": 1.0, "step": 475 }, { "all_correct": 0.40625, "all_wrong": 0.1875, "completion_length": 83.51953125, "epoch": 0.8998109640831758, "grad_norm": 6.850505412934267, "kl": 0.044921875, "learning_rate": 4.912716851769394e-08, "loss": 0.0018, "reward": 1.649026870727539, "reward_std": 0.15467888116836548, "rewards/accuracy_reward": 0.6490268707275391, "rewards/format_reward": 1.0, "step": 476 }, { "all_correct": 0.34375, "all_wrong": 0.1875, "completion_length": 87.7109375, "epoch": 0.9017013232514177, "grad_norm": 3.519242696315356, "kl": 0.044677734375, "learning_rate": 4.730542222431222e-08, "loss": 0.0018, "reward": 1.6143535375595093, "reward_std": 0.17040878534317017, "rewards/accuracy_reward": 0.6143535375595093, "rewards/format_reward": 1.0, "step": 477 }, { "all_correct": 0.34375, "all_wrong": 0.1875, "completion_length": 90.99609375, "epoch": 0.9035916824196597, "grad_norm": 3.880434261673006, "kl": 0.038818359375, "learning_rate": 4.5517276050150325e-08, "loss": 0.0016, "reward": 1.5734894275665283, "reward_std": 0.16900530457496643, "rewards/accuracy_reward": 0.5734893679618835, "rewards/format_reward": 1.0, "step": 478 }, { "all_correct": 0.4375, "all_wrong": 0.0625, "completion_length": 90.35546875, "epoch": 0.9054820415879017, "grad_norm": 1.8707086039575145, "kl": 0.041015625, "learning_rate": 4.3762793060461824e-08, "loss": 0.0016, "reward": 1.6607571840286255, "reward_std": 0.23725268244743347, "rewards/accuracy_reward": 0.6763821840286255, "rewards/format_reward": 0.984375, "step": 479 }, { "all_correct": 0.25, "all_wrong": 0.1875, "completion_length": 87.125, "epoch": 0.9073724007561437, "grad_norm": 2.978657513471123, "kl": 0.048583984375, "learning_rate": 4.2042035133248885e-08, "loss": 0.0019, "reward": 1.4891417026519775, "reward_std": 0.2210281491279602, "rewards/accuracy_reward": 0.48914170265197754, "rewards/format_reward": 1.0, "step": 480 }, { "all_correct": 0.21875, "all_wrong": 0.21875, "completion_length": 88.83984375, "epoch": 0.9092627599243857, "grad_norm": 1.9106697468452067, "kl": 0.0458984375, "learning_rate": 4.035506295708191e-08, "loss": 0.0018, "reward": 1.4589961767196655, "reward_std": 0.21063633263111115, "rewards/accuracy_reward": 0.46290236711502075, "rewards/format_reward": 0.99609375, "step": 481 }, { "all_correct": 0.40625, "all_wrong": 0.15625, "completion_length": 95.41796875, "epoch": 0.9111531190926276, "grad_norm": 3.468786099498894, "kl": 0.046875, "learning_rate": 3.870193602895733e-08, "loss": 0.0019, "reward": 1.6448715925216675, "reward_std": 0.16561608016490936, "rewards/accuracy_reward": 0.6448715329170227, "rewards/format_reward": 1.0, "step": 482 }, { "all_correct": 0.46875, "all_wrong": 0.15625, "completion_length": 96.2109375, "epoch": 0.9130434782608695, "grad_norm": 2.104789227248534, "kl": 0.0361328125, "learning_rate": 3.708271265220087e-08, "loss": 0.0015, "reward": 1.6773532629013062, "reward_std": 0.16596126556396484, "rewards/accuracy_reward": 0.7046970725059509, "rewards/format_reward": 0.97265625, "step": 483 }, { "all_correct": 0.3125, "all_wrong": 0.28125, "completion_length": 89.72265625, "epoch": 0.9149338374291115, "grad_norm": 6.692599057258857, "kl": 0.042236328125, "learning_rate": 3.5497449934409396e-08, "loss": 0.0017, "reward": 1.45703125, "reward_std": 0.19940373301506042, "rewards/accuracy_reward": 0.46484375, "rewards/format_reward": 0.9921875, "step": 484 }, { "all_correct": 0.5, "all_wrong": 0.15625, "completion_length": 89.53515625, "epoch": 0.9168241965973535, "grad_norm": 1.8841313081503308, "kl": 0.05078125, "learning_rate": 3.394620378543911e-08, "loss": 0.002, "reward": 1.6709468364715576, "reward_std": 0.13965073227882385, "rewards/accuracy_reward": 0.6826655268669128, "rewards/format_reward": 0.98828125, "step": 485 }, { "all_correct": 0.40625, "all_wrong": 0.25, "completion_length": 86.93359375, "epoch": 0.9187145557655955, "grad_norm": 1.673566002692786, "kl": 0.046142578125, "learning_rate": 3.2429028915431534e-08, "loss": 0.0018, "reward": 1.5806677341461182, "reward_std": 0.13359013199806213, "rewards/accuracy_reward": 0.5806676149368286, "rewards/format_reward": 1.0, "step": 486 }, { "all_correct": 0.34375, "all_wrong": 0.1875, "completion_length": 88.890625, "epoch": 0.9206049149338374, "grad_norm": 4.461429054725898, "kl": 0.041259765625, "learning_rate": 3.094597883288574e-08, "loss": 0.0016, "reward": 1.56640625, "reward_std": 0.2223491370677948, "rewards/accuracy_reward": 0.5703125, "rewards/format_reward": 0.99609375, "step": 487 }, { "all_correct": 0.34375, "all_wrong": 0.09375, "completion_length": 99.546875, "epoch": 0.9224952741020794, "grad_norm": 2.074395570322666, "kl": 0.04638671875, "learning_rate": 2.9497105842769433e-08, "loss": 0.0019, "reward": 1.6248832941055298, "reward_std": 0.2539004683494568, "rewards/accuracy_reward": 0.636601984500885, "rewards/format_reward": 0.98828125, "step": 488 }, { "all_correct": 0.34375, "all_wrong": 0.25, "completion_length": 84.6328125, "epoch": 0.9243856332703214, "grad_norm": 1.4101059267718767, "kl": 0.04833984375, "learning_rate": 2.808246104467582e-08, "loss": 0.0019, "reward": 1.5731714963912964, "reward_std": 0.1267606019973755, "rewards/accuracy_reward": 0.5731715559959412, "rewards/format_reward": 1.0, "step": 489 }, { "all_correct": 0.34375, "all_wrong": 0.125, "completion_length": 89.83203125, "epoch": 0.9262759924385633, "grad_norm": 2.2978085302274267, "kl": 0.04541015625, "learning_rate": 2.6702094331020886e-08, "loss": 0.0018, "reward": 1.6596397161483765, "reward_std": 0.18694524466991425, "rewards/accuracy_reward": 0.6596397161483765, "rewards/format_reward": 1.0, "step": 490 }, { "all_correct": 0.4375, "all_wrong": 0.28125, "completion_length": 91.2265625, "epoch": 0.9281663516068053, "grad_norm": 1.207942110015228, "kl": 0.044677734375, "learning_rate": 2.5356054385282766e-08, "loss": 0.0018, "reward": 1.611553430557251, "reward_std": 0.09651355445384979, "rewards/accuracy_reward": 0.611553430557251, "rewards/format_reward": 1.0, "step": 491 }, { "all_correct": 0.4375, "all_wrong": 0.0625, "completion_length": 100.76953125, "epoch": 0.9300567107750473, "grad_norm": 1.46194048244851, "kl": 0.039794921875, "learning_rate": 2.4044388680286575e-08, "loss": 0.0016, "reward": 1.69921875, "reward_std": 0.25242602825164795, "rewards/accuracy_reward": 0.7109375, "rewards/format_reward": 0.98828125, "step": 492 }, { "all_correct": 0.4375, "all_wrong": 0.03125, "completion_length": 103.296875, "epoch": 0.9319470699432892, "grad_norm": 1.6142721258162138, "kl": 0.03857421875, "learning_rate": 2.2767143476528306e-08, "loss": 0.0015, "reward": 1.6534472703933716, "reward_std": 0.2080773115158081, "rewards/accuracy_reward": 0.6690722107887268, "rewards/format_reward": 0.984375, "step": 493 }, { "all_correct": 0.4375, "all_wrong": 0.0625, "completion_length": 96.51171875, "epoch": 0.9338374291115312, "grad_norm": 1.5128739509537954, "kl": 0.037841796875, "learning_rate": 2.152436382054479e-08, "loss": 0.0015, "reward": 1.6529420614242554, "reward_std": 0.22081422805786133, "rewards/accuracy_reward": 0.6607545614242554, "rewards/format_reward": 0.9921875, "step": 494 }, { "all_correct": 0.4375, "all_wrong": 0.1875, "completion_length": 88.4765625, "epoch": 0.9357277882797732, "grad_norm": 1.4350308350145478, "kl": 0.05126953125, "learning_rate": 2.0316093543323753e-08, "loss": 0.0021, "reward": 1.5785757303237915, "reward_std": 0.1358921080827713, "rewards/accuracy_reward": 0.5785757303237915, "rewards/format_reward": 1.0, "step": 495 }, { "all_correct": 0.34375, "all_wrong": 0.28125, "completion_length": 91.57421875, "epoch": 0.9376181474480151, "grad_norm": 1.4943511024476006, "kl": 0.042236328125, "learning_rate": 1.914237525875917e-08, "loss": 0.0017, "reward": 1.4834184646606445, "reward_std": 0.1201879009604454, "rewards/accuracy_reward": 0.48341846466064453, "rewards/format_reward": 1.0, "step": 496 }, { "all_correct": 0.5, "all_wrong": 0.1875, "completion_length": 90.48046875, "epoch": 0.9395085066162571, "grad_norm": 1.2851391631205682, "kl": 0.040283203125, "learning_rate": 1.8003250362147004e-08, "loss": 0.0016, "reward": 1.651926040649414, "reward_std": 0.11862440407276154, "rewards/accuracy_reward": 0.6519260406494141, "rewards/format_reward": 1.0, "step": 497 }, { "all_correct": 0.34375, "all_wrong": 0.25, "completion_length": 91.71484375, "epoch": 0.941398865784499, "grad_norm": 3.68427646191117, "kl": 0.04296875, "learning_rate": 1.6898759028726283e-08, "loss": 0.0017, "reward": 1.55078125, "reward_std": 0.19926638901233673, "rewards/accuracy_reward": 0.5546875, "rewards/format_reward": 0.99609375, "step": 498 }, { "all_correct": 0.28125, "all_wrong": 0.25, "completion_length": 88.03515625, "epoch": 0.943289224952741, "grad_norm": 1.9461501149892964, "kl": 0.045654296875, "learning_rate": 1.5828940212261887e-08, "loss": 0.0018, "reward": 1.4915789365768433, "reward_std": 0.1742965131998062, "rewards/accuracy_reward": 0.49157893657684326, "rewards/format_reward": 1.0, "step": 499 }, { "all_correct": 0.3125, "all_wrong": 0.21875, "completion_length": 104.78515625, "epoch": 0.945179584120983, "grad_norm": 1.4451954837871217, "kl": 0.040283203125, "learning_rate": 1.4793831643670429e-08, "loss": 0.0016, "reward": 1.5099118947982788, "reward_std": 0.25256532430648804, "rewards/accuracy_reward": 0.5294432044029236, "rewards/format_reward": 0.98046875, "step": 500 }, { "all_correct": 0.46875, "all_wrong": 0.21875, "completion_length": 100.34765625, "epoch": 0.947069943289225, "grad_norm": 1.6974654768712778, "kl": 0.039794921875, "learning_rate": 1.3793469829689986e-08, "loss": 0.0016, "reward": 1.6564844846725464, "reward_std": 0.11369632184505463, "rewards/accuracy_reward": 0.6564844846725464, "rewards/format_reward": 1.0, "step": 501 }, { "all_correct": 0.34375, "all_wrong": 0.1875, "completion_length": 90.28515625, "epoch": 0.9489603024574669, "grad_norm": 1.725404425317813, "kl": 0.040283203125, "learning_rate": 1.2827890051592127e-08, "loss": 0.0016, "reward": 1.6119554042816162, "reward_std": 0.16685199737548828, "rewards/accuracy_reward": 0.6119554042816162, "rewards/format_reward": 1.0, "step": 502 }, { "all_correct": 0.3125, "all_wrong": 0.21875, "completion_length": 100.02734375, "epoch": 0.9508506616257089, "grad_norm": 1.6905254249447337, "kl": 0.04150390625, "learning_rate": 1.1897126363937803e-08, "loss": 0.0017, "reward": 1.6142412424087524, "reward_std": 0.18831773102283478, "rewards/accuracy_reward": 0.6142412424087524, "rewards/format_reward": 1.0, "step": 503 }, { "all_correct": 0.40625, "all_wrong": 0.15625, "completion_length": 97.78125, "epoch": 0.9527410207939508, "grad_norm": 1.4091412694145533, "kl": 0.0390625, "learning_rate": 1.1001211593376525e-08, "loss": 0.0016, "reward": 1.5735445022583008, "reward_std": 0.17769688367843628, "rewards/accuracy_reward": 0.577450692653656, "rewards/format_reward": 0.99609375, "step": 504 }, { "all_correct": 0.40625, "all_wrong": 0.1875, "completion_length": 81.171875, "epoch": 0.9546313799621928, "grad_norm": 2.0493717412947645, "kl": 0.04443359375, "learning_rate": 1.0140177337488287e-08, "loss": 0.0018, "reward": 1.6359052658081055, "reward_std": 0.15704122185707092, "rewards/accuracy_reward": 0.6359052062034607, "rewards/format_reward": 1.0, "step": 505 }, { "all_correct": 0.1875, "all_wrong": 0.1875, "completion_length": 91.32421875, "epoch": 0.9565217391304348, "grad_norm": 3.7195758010441407, "kl": 0.047607421875, "learning_rate": 9.314053963669244e-09, "loss": 0.0019, "reward": 1.5355010032653809, "reward_std": 0.20620205998420715, "rewards/accuracy_reward": 0.5355010628700256, "rewards/format_reward": 1.0, "step": 506 }, { "all_correct": 0.40625, "all_wrong": 0.125, "completion_length": 93.41796875, "epoch": 0.9584120982986768, "grad_norm": 4.273675771479045, "kl": 0.0419921875, "learning_rate": 8.522870608060562e-09, "loss": 0.0017, "reward": 1.6529306173324585, "reward_std": 0.17892761528491974, "rewards/accuracy_reward": 0.6529306173324585, "rewards/format_reward": 1.0, "step": 507 }, { "all_correct": 0.4375, "all_wrong": 0.15625, "completion_length": 95.71875, "epoch": 0.9603024574669187, "grad_norm": 1.471506691627891, "kl": 0.045166015625, "learning_rate": 7.766655174521464e-09, "loss": 0.0018, "reward": 1.6168668270111084, "reward_std": 0.13944411277770996, "rewards/accuracy_reward": 0.6168668270111084, "rewards/format_reward": 1.0, "step": 508 }, { "all_correct": 0.25, "all_wrong": 0.21875, "completion_length": 94.5, "epoch": 0.9621928166351607, "grad_norm": 3.051208934788439, "kl": 0.042236328125, "learning_rate": 7.045434333643796e-09, "loss": 0.0017, "reward": 1.5761425495147705, "reward_std": 0.22245533764362335, "rewards/accuracy_reward": 0.5761424899101257, "rewards/format_reward": 1.0, "step": 509 }, { "all_correct": 0.28125, "all_wrong": 0.15625, "completion_length": 90.06640625, "epoch": 0.9640831758034026, "grad_norm": 1.9014046704475331, "kl": 0.042724609375, "learning_rate": 6.3592335218132235e-09, "loss": 0.0017, "reward": 1.5352835655212402, "reward_std": 0.22351181507110596, "rewards/accuracy_reward": 0.5352836847305298, "rewards/format_reward": 1.0, "step": 510 } ], "logging_steps": 1.0, "max_steps": 529, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 510, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }