{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 653, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 166.125, "epoch": 0.0015313935681470138, "grad_norm": 14.935008926918243, "kl": 0.0, "learning_rate": 9.999942135453495e-07, "loss": -0.0, "reward": 3.6875, "reward_std": 0.5246413946151733, "rewards/accuracy_reward": 2.3874998092651367, "rewards/format_reward": 1.0, "step": 1, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 221.96875, "epoch": 0.0030627871362940277, "grad_norm": 9.579596700958447, "kl": 0.000728607177734375, "learning_rate": 9.999768543153299e-07, "loss": 0.0, "reward": 3.549999952316284, "reward_std": 0.5580562353134155, "rewards/accuracy_reward": 2.25, "rewards/format_reward": 1.0, "step": 2, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 221.6875, "epoch": 0.004594180704441042, "grad_norm": 10.756952732551833, "kl": 0.00072479248046875, "learning_rate": 9.99947922711735e-07, "loss": 0.0, "reward": 3.4437499046325684, "reward_std": 0.3714633882045746, "rewards/accuracy_reward": 2.1437501907348633, "rewards/format_reward": 1.0, "step": 3, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 244.75, "epoch": 0.006125574272588055, "grad_norm": 8.487559202337772, "kl": 0.0007781982421875, "learning_rate": 9.999074194042105e-07, "loss": 0.0, "reward": 3.0625, "reward_std": 0.4976257085800171, "rewards/accuracy_reward": 1.7625000476837158, "rewards/format_reward": 1.0, "step": 4, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 231.3125, "epoch": 0.007656967840735069, "grad_norm": 4.6294295976229085, "kl": 0.000553131103515625, "learning_rate": 9.998553453302385e-07, "loss": 0.0, "reward": 3.59375, "reward_std": 0.5659699440002441, "rewards/accuracy_reward": 2.2937498092651367, "rewards/format_reward": 1.0, "step": 5, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 198.9375, "epoch": 0.009188361408882083, "grad_norm": 6.183404279543758, "kl": 0.000736236572265625, "learning_rate": 9.997917016951161e-07, "loss": 0.0, "reward": 3.4624998569488525, "reward_std": 0.5215482711791992, "rewards/accuracy_reward": 2.1624999046325684, "rewards/format_reward": 1.0, "step": 6, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 236.15625, "epoch": 0.010719754977029096, "grad_norm": 7.995581870293477, "kl": 0.000652313232421875, "learning_rate": 9.997164899719272e-07, "loss": 0.0, "reward": 3.1812500953674316, "reward_std": 0.6596391201019287, "rewards/accuracy_reward": 1.9562499523162842, "rewards/format_reward": 1.0, "step": 7, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 223.53125, "epoch": 0.01225114854517611, "grad_norm": 5.97513336353613, "kl": 0.0009613037109375, "learning_rate": 9.996297119015088e-07, "loss": 0.0, "reward": 3.0562500953674316, "reward_std": 0.38864773511886597, "rewards/accuracy_reward": 1.7562501430511475, "rewards/format_reward": 1.0, "step": 8, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 193.3125, "epoch": 0.013782542113323124, "grad_norm": 21.108476928401185, "kl": 0.00077056884765625, "learning_rate": 9.995313694924106e-07, "loss": 0.0, "reward": 3.418750047683716, "reward_std": 0.5562530755996704, "rewards/accuracy_reward": 2.1187498569488525, "rewards/format_reward": 1.0, "step": 9, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 226.625, "epoch": 0.015313935681470138, "grad_norm": 13.570920274198468, "kl": 0.00099945068359375, "learning_rate": 9.99421465020848e-07, "loss": 0.0, "reward": 2.5812501907348633, "reward_std": 0.4478328227996826, "rewards/accuracy_reward": 1.3562500476837158, "rewards/format_reward": 1.0, "step": 10, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 209.25, "epoch": 0.016845329249617153, "grad_norm": 7.2683619536034545, "kl": 0.001190185546875, "learning_rate": 9.9930000103065e-07, "loss": 0.0, "reward": 3.0875000953674316, "reward_std": 0.3895391523838043, "rewards/accuracy_reward": 1.787500023841858, "rewards/format_reward": 1.0, "step": 11, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 203.15625, "epoch": 0.018376722817764167, "grad_norm": 6.1372904663686905, "kl": 0.0010833740234375, "learning_rate": 9.991669803331996e-07, "loss": 0.0, "reward": 3.2937498092651367, "reward_std": 0.5000158548355103, "rewards/accuracy_reward": 1.993749976158142, "rewards/format_reward": 1.0, "step": 12, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 210.75, "epoch": 0.019908116385911178, "grad_norm": 12.89544676032573, "kl": 0.0013580322265625, "learning_rate": 9.990224060073705e-07, "loss": 0.0, "reward": 2.84375, "reward_std": 0.4483214318752289, "rewards/accuracy_reward": 1.5437499284744263, "rewards/format_reward": 1.0, "step": 13, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 171.71875, "epoch": 0.021439509954058193, "grad_norm": 4.748966564107283, "kl": 0.0010833740234375, "learning_rate": 9.988662813994532e-07, "loss": 0.0, "reward": 3.2125000953674316, "reward_std": 0.4343854486942291, "rewards/accuracy_reward": 1.9124999046325684, "rewards/format_reward": 1.0, "step": 14, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 216.1875, "epoch": 0.022970903522205207, "grad_norm": 4.904787481992538, "kl": 0.00148773193359375, "learning_rate": 9.9869861012308e-07, "loss": 0.0, "reward": 2.924999952316284, "reward_std": 0.39761877059936523, "rewards/accuracy_reward": 1.625, "rewards/format_reward": 1.0, "step": 15, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 237.59375, "epoch": 0.02450229709035222, "grad_norm": 8.580698497931925, "kl": 0.0013275146484375, "learning_rate": 9.985193960591395e-07, "loss": 0.0, "reward": 2.3187499046325684, "reward_std": 0.21549977362155914, "rewards/accuracy_reward": 1.1687500476837158, "rewards/format_reward": 1.0, "step": 16, "temporal_rewards": 0.5 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 237.59375, "epoch": 0.026033690658499236, "grad_norm": 5.153356819387367, "kl": 0.00135040283203125, "learning_rate": 9.98328643355688e-07, "loss": 0.0, "reward": 3.5249998569488525, "reward_std": 0.46054166555404663, "rewards/accuracy_reward": 2.2249999046325684, "rewards/format_reward": 1.0, "step": 17, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 210.59375, "epoch": 0.027565084226646247, "grad_norm": 10.800790084406668, "kl": 0.001556396484375, "learning_rate": 9.981263564278534e-07, "loss": 0.0, "reward": 3.2437500953674316, "reward_std": 0.31860852241516113, "rewards/accuracy_reward": 1.943750023841858, "rewards/format_reward": 1.0, "step": 18, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 203.4375, "epoch": 0.02909647779479326, "grad_norm": 8.032416154857067, "kl": 0.00142669677734375, "learning_rate": 9.979125399577318e-07, "loss": 0.0, "reward": 3.5812501907348633, "reward_std": 0.592538595199585, "rewards/accuracy_reward": 2.28125, "rewards/format_reward": 1.0, "step": 19, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 216.40625, "epoch": 0.030627871362940276, "grad_norm": 8.373934147836081, "kl": 0.001373291015625, "learning_rate": 9.976871988942804e-07, "loss": 0.0, "reward": 3.6812500953674316, "reward_std": 0.5628457069396973, "rewards/accuracy_reward": 2.3812499046325684, "rewards/format_reward": 1.0, "step": 20, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 188.21875, "epoch": 0.03215926493108729, "grad_norm": 8.031005962405207, "kl": 0.0020599365234375, "learning_rate": 9.974503384532027e-07, "loss": 0.0, "reward": 3.0062499046325684, "reward_std": 0.6598080396652222, "rewards/accuracy_reward": 1.8562499284744263, "rewards/format_reward": 1.0, "step": 21, "temporal_rewards": 0.5 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 233.875, "epoch": 0.033690658499234305, "grad_norm": 7.26237019193215, "kl": 0.00171661376953125, "learning_rate": 9.972019641168275e-07, "loss": 0.0, "reward": 3.6875, "reward_std": 0.6160791516304016, "rewards/accuracy_reward": 2.3874998092651367, "rewards/format_reward": 1.0, "step": 22, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 210.90625, "epoch": 0.03522205206738132, "grad_norm": 22.405745531866504, "kl": 0.001953125, "learning_rate": 9.969420816339821e-07, "loss": 0.0, "reward": 3.2437498569488525, "reward_std": 0.40731462836265564, "rewards/accuracy_reward": 1.943750023841858, "rewards/format_reward": 1.0, "step": 23, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 247.03125, "epoch": 0.036753445635528334, "grad_norm": 5.848799650690625, "kl": 0.0017852783203125, "learning_rate": 9.966706970198596e-07, "loss": 0.0, "reward": 3.5562500953674316, "reward_std": 0.51361083984375, "rewards/accuracy_reward": 2.2562499046325684, "rewards/format_reward": 1.0, "step": 24, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 194.09375, "epoch": 0.03828483920367534, "grad_norm": 7.357989272890045, "kl": 0.0019989013671875, "learning_rate": 9.963878165558785e-07, "loss": 0.0, "reward": 3.987499952316284, "reward_std": 0.3582419753074646, "rewards/accuracy_reward": 2.6875, "rewards/format_reward": 1.0, "step": 25, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 196.65625, "epoch": 0.039816232771822356, "grad_norm": 9.846159640012448, "kl": 0.0020751953125, "learning_rate": 9.960934467895391e-07, "loss": 0.0, "reward": 3.1875, "reward_std": 0.37332883477211, "rewards/accuracy_reward": 1.8875000476837158, "rewards/format_reward": 1.0, "step": 26, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 173.78125, "epoch": 0.04134762633996937, "grad_norm": 11.76376909133194, "kl": 0.00213623046875, "learning_rate": 9.957875945342706e-07, "loss": 0.0, "reward": 3.487499952316284, "reward_std": 0.6070871949195862, "rewards/accuracy_reward": 2.1875, "rewards/format_reward": 1.0, "step": 27, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 195.40625, "epoch": 0.042879019908116385, "grad_norm": 5.532470814281292, "kl": 0.00244140625, "learning_rate": 9.954702668692737e-07, "loss": 0.0, "reward": 3.231250047683716, "reward_std": 0.4620515704154968, "rewards/accuracy_reward": 1.931249976158142, "rewards/format_reward": 1.0, "step": 28, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 199.0, "epoch": 0.0444104134762634, "grad_norm": 5.1077668198943265, "kl": 0.001983642578125, "learning_rate": 9.951414711393568e-07, "loss": 0.0, "reward": 2.8375000953674316, "reward_std": 0.5319792032241821, "rewards/accuracy_reward": 1.6124999523162842, "rewards/format_reward": 1.0, "step": 29, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 196.28125, "epoch": 0.045941807044410414, "grad_norm": 9.955982570369333, "kl": 0.0024566650390625, "learning_rate": 9.948012149547666e-07, "loss": 0.0, "reward": 3.6875, "reward_std": 0.4107499122619629, "rewards/accuracy_reward": 2.387500047683716, "rewards/format_reward": 1.0, "step": 30, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 190.34375, "epoch": 0.04747320061255743, "grad_norm": 14.926260894095993, "kl": 0.0030517578125, "learning_rate": 9.94449506191011e-07, "loss": 0.0, "reward": 3.34375, "reward_std": 0.5044288635253906, "rewards/accuracy_reward": 2.043750047683716, "rewards/format_reward": 1.0, "step": 31, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 230.78125, "epoch": 0.04900459418070444, "grad_norm": 5.519507890472418, "kl": 0.0035400390625, "learning_rate": 9.94086352988677e-07, "loss": 0.0, "reward": 3.4124999046325684, "reward_std": 0.4231412410736084, "rewards/accuracy_reward": 2.112499952316284, "rewards/format_reward": 1.0, "step": 32, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 235.5, "epoch": 0.05053598774885146, "grad_norm": 17.77903651092269, "kl": 0.003143310546875, "learning_rate": 9.937117637532426e-07, "loss": 0.0, "reward": 2.9937500953674316, "reward_std": 0.3739195168018341, "rewards/accuracy_reward": 1.6937501430511475, "rewards/format_reward": 1.0, "step": 33, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 208.59375, "epoch": 0.05206738131699847, "grad_norm": 7.665468500582905, "kl": 0.0030517578125, "learning_rate": 9.933257471548827e-07, "loss": 0.0, "reward": 3.4187498092651367, "reward_std": 0.36628347635269165, "rewards/accuracy_reward": 2.1187498569488525, "rewards/format_reward": 1.0, "step": 34, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 271.78125, "epoch": 0.05359877488514548, "grad_norm": 8.63810081218864, "kl": 0.0032196044921875, "learning_rate": 9.929283121282675e-07, "loss": 0.0, "reward": 3.25, "reward_std": 0.484809547662735, "rewards/accuracy_reward": 1.9500000476837158, "rewards/format_reward": 1.0, "step": 35, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 246.90625, "epoch": 0.055130168453292494, "grad_norm": 5.75915846803757, "kl": 0.004119873046875, "learning_rate": 9.925194678723557e-07, "loss": 0.0, "reward": 3.1187500953674316, "reward_std": 0.3506025969982147, "rewards/accuracy_reward": 1.8187501430511475, "rewards/format_reward": 1.0, "step": 36, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 217.3125, "epoch": 0.05666156202143951, "grad_norm": 21.5613186498873, "kl": 0.004913330078125, "learning_rate": 9.920992238501823e-07, "loss": 0.0, "reward": 3.9250001907348633, "reward_std": 0.48906704783439636, "rewards/accuracy_reward": 2.625, "rewards/format_reward": 1.0, "step": 37, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 244.5, "epoch": 0.05819295558958652, "grad_norm": 46.91695755260845, "kl": 0.00445556640625, "learning_rate": 9.916675897886394e-07, "loss": 0.0, "reward": 3.518749952316284, "reward_std": 0.4832785129547119, "rewards/accuracy_reward": 2.293750047683716, "rewards/format_reward": 1.0, "step": 38, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 227.8125, "epoch": 0.05972434915773354, "grad_norm": 7.145992982612995, "kl": 0.006683349609375, "learning_rate": 9.912245756782507e-07, "loss": 0.0, "reward": 3.1312499046325684, "reward_std": 0.4973461627960205, "rewards/accuracy_reward": 1.9062501192092896, "rewards/format_reward": 1.0, "step": 39, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 269.8125, "epoch": 0.06125574272588055, "grad_norm": 4.172187313214933, "kl": 0.0057373046875, "learning_rate": 9.9077019177294e-07, "loss": 0.0, "reward": 2.9437499046325684, "reward_std": 0.392461895942688, "rewards/accuracy_reward": 1.6437499523162842, "rewards/format_reward": 1.0, "step": 40, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 229.15625, "epoch": 0.06278713629402756, "grad_norm": 10.998221333714971, "kl": 0.007354736328125, "learning_rate": 9.903044485897955e-07, "loss": 0.0, "reward": 3.3499999046325684, "reward_std": 0.46540987491607666, "rewards/accuracy_reward": 2.049999952316284, "rewards/format_reward": 1.0, "step": 41, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 192.8125, "epoch": 0.06431852986217458, "grad_norm": 12.155267553825752, "kl": 0.00628662109375, "learning_rate": 9.89827356908824e-07, "loss": 0.0, "reward": 3.424999952316284, "reward_std": 0.5948097109794617, "rewards/accuracy_reward": 2.125, "rewards/format_reward": 1.0, "step": 42, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 250.28125, "epoch": 0.06584992343032159, "grad_norm": 5.325364528226017, "kl": 0.0048828125, "learning_rate": 9.89338927772703e-07, "loss": 0.0, "reward": 2.84375, "reward_std": 0.35478055477142334, "rewards/accuracy_reward": 1.5437500476837158, "rewards/format_reward": 1.0, "step": 43, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 212.8125, "epoch": 0.06738131699846861, "grad_norm": 16.303494875871205, "kl": 0.00958251953125, "learning_rate": 9.888391724865245e-07, "loss": 0.0, "reward": 2.9375, "reward_std": 0.3671438694000244, "rewards/accuracy_reward": 1.712499976158142, "rewards/format_reward": 1.0, "step": 44, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 207.59375, "epoch": 0.06891271056661562, "grad_norm": 10.183512800331151, "kl": 0.0079345703125, "learning_rate": 9.88328102617534e-07, "loss": 0.0, "reward": 3.2562499046325684, "reward_std": 0.332864373922348, "rewards/accuracy_reward": 2.03125, "rewards/format_reward": 1.0, "step": 45, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 271.9375, "epoch": 0.07044410413476264, "grad_norm": 5.120622299709726, "kl": 0.007080078125, "learning_rate": 9.87805729994862e-07, "loss": 0.0, "reward": 3.112499952316284, "reward_std": 0.562711238861084, "rewards/accuracy_reward": 1.8125, "rewards/format_reward": 1.0, "step": 46, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 237.6875, "epoch": 0.07197549770290965, "grad_norm": 4.367373681303007, "kl": 0.00701904296875, "learning_rate": 9.872720667092505e-07, "loss": 0.0, "reward": 2.9937500953674316, "reward_std": 0.48836949467658997, "rewards/accuracy_reward": 1.693750023841858, "rewards/format_reward": 1.0, "step": 47, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 256.25, "epoch": 0.07350689127105667, "grad_norm": 16.103884519005483, "kl": 0.006805419921875, "learning_rate": 9.867271251127727e-07, "loss": 0.0, "reward": 3.3125, "reward_std": 0.3035011291503906, "rewards/accuracy_reward": 2.012500047683716, "rewards/format_reward": 1.0, "step": 48, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 231.03125, "epoch": 0.07503828483920368, "grad_norm": 8.788502438995774, "kl": 0.0089111328125, "learning_rate": 9.861709178185483e-07, "loss": 0.0, "reward": 3.2437498569488525, "reward_std": 0.5186760425567627, "rewards/accuracy_reward": 2.018749952316284, "rewards/format_reward": 1.0, "step": 49, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 243.125, "epoch": 0.07656967840735068, "grad_norm": 4.596890246898075, "kl": 0.0096435546875, "learning_rate": 9.856034577004504e-07, "loss": 0.0, "reward": 3.081249713897705, "reward_std": 0.4919166564941406, "rewards/accuracy_reward": 1.78125, "rewards/format_reward": 1.0, "step": 50, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 305.6875, "epoch": 0.0781010719754977, "grad_norm": 13.961517391453988, "kl": 0.006011962890625, "learning_rate": 9.850247578928079e-07, "loss": 0.0, "reward": 3.53125, "reward_std": 0.5289031267166138, "rewards/accuracy_reward": 2.2312498092651367, "rewards/format_reward": 1.0, "step": 51, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 291.0625, "epoch": 0.07963246554364471, "grad_norm": 6.3692603502129845, "kl": 0.00714111328125, "learning_rate": 9.844348317901016e-07, "loss": 0.0, "reward": 3.0749998092651367, "reward_std": 0.36224496364593506, "rewards/accuracy_reward": 1.774999976158142, "rewards/format_reward": 1.0, "step": 52, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 252.3125, "epoch": 0.08116385911179173, "grad_norm": 6.412747585458219, "kl": 0.00823974609375, "learning_rate": 9.838336930466539e-07, "loss": 0.0, "reward": 3.1937499046325684, "reward_std": 0.4673565626144409, "rewards/accuracy_reward": 1.8937498331069946, "rewards/format_reward": 1.0, "step": 53, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 231.6875, "epoch": 0.08269525267993874, "grad_norm": 5.18721005404993, "kl": 0.0108642578125, "learning_rate": 9.832213555763134e-07, "loss": 0.0, "reward": 3.0187501907348633, "reward_std": 0.469946026802063, "rewards/accuracy_reward": 1.71875, "rewards/format_reward": 1.0, "step": 54, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 282.15625, "epoch": 0.08422664624808576, "grad_norm": 7.62980944648513, "kl": 0.00665283203125, "learning_rate": 9.82597833552132e-07, "loss": 0.0, "reward": 3.081249952316284, "reward_std": 0.5897158980369568, "rewards/accuracy_reward": 1.78125, "rewards/format_reward": 1.0, "step": 55, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 270.53125, "epoch": 0.08575803981623277, "grad_norm": 12.90972700975486, "kl": 0.0084228515625, "learning_rate": 9.819631414060372e-07, "loss": 0.0, "reward": 3.3812499046325684, "reward_std": 0.5733855962753296, "rewards/accuracy_reward": 2.081249952316284, "rewards/format_reward": 1.0, "step": 56, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 236.21875, "epoch": 0.08728943338437979, "grad_norm": 8.789318251801276, "kl": 0.00860595703125, "learning_rate": 9.813172938284986e-07, "loss": 0.0, "reward": 3.424999713897705, "reward_std": 0.5403081774711609, "rewards/accuracy_reward": 2.125, "rewards/format_reward": 1.0, "step": 57, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 256.125, "epoch": 0.0888208269525268, "grad_norm": 4.920611432887529, "kl": 0.0101318359375, "learning_rate": 9.806603057681868e-07, "loss": 0.0, "reward": 3.0999999046325684, "reward_std": 0.3320375978946686, "rewards/accuracy_reward": 1.7999999523162842, "rewards/format_reward": 1.0, "step": 58, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 266.78125, "epoch": 0.0903522205206738, "grad_norm": 6.933811561610319, "kl": 0.0098876953125, "learning_rate": 9.799921924316283e-07, "loss": 0.0, "reward": 3.5062499046325684, "reward_std": 0.5279309749603271, "rewards/accuracy_reward": 2.206249952316284, "rewards/format_reward": 1.0, "step": 59, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 250.03125, "epoch": 0.09188361408882083, "grad_norm": 6.437867895100921, "kl": 0.01080322265625, "learning_rate": 9.793129692828533e-07, "loss": 0.0, "reward": 2.9499998092651367, "reward_std": 0.3246353566646576, "rewards/accuracy_reward": 1.649999976158142, "rewards/format_reward": 1.0, "step": 60, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 275.25, "epoch": 0.09341500765696784, "grad_norm": 9.798775021650894, "kl": 0.009765625, "learning_rate": 9.786226520430374e-07, "loss": 0.0, "reward": 3.406249761581421, "reward_std": 0.5513333082199097, "rewards/accuracy_reward": 2.1812498569488525, "rewards/format_reward": 1.0, "step": 61, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 287.96875, "epoch": 0.09494640122511486, "grad_norm": 13.876179036973484, "kl": 0.01104736328125, "learning_rate": 9.779212566901385e-07, "loss": 0.0, "reward": 4.0625, "reward_std": 0.5208388566970825, "rewards/accuracy_reward": 2.762500047683716, "rewards/format_reward": 1.0, "step": 62, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 246.78125, "epoch": 0.09647779479326186, "grad_norm": 26.754853896720867, "kl": 0.0223388671875, "learning_rate": 9.77208799458526e-07, "loss": 0.0, "reward": 3.125, "reward_std": 0.3128255605697632, "rewards/accuracy_reward": 1.8249999284744263, "rewards/format_reward": 1.0, "step": 63, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 243.84375, "epoch": 0.09800918836140889, "grad_norm": 7.174348819697357, "kl": 0.01190185546875, "learning_rate": 9.76485296838606e-07, "loss": 0.0, "reward": 3.0687499046325684, "reward_std": 0.35120201110839844, "rewards/accuracy_reward": 1.7687500715255737, "rewards/format_reward": 1.0, "step": 64, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 244.0625, "epoch": 0.0995405819295559, "grad_norm": 5.899616368592248, "kl": 0.0130615234375, "learning_rate": 9.757507655764384e-07, "loss": 0.0, "reward": 2.96875, "reward_std": 0.3577200174331665, "rewards/accuracy_reward": 1.6687500476837158, "rewards/format_reward": 1.0, "step": 65, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 280.125, "epoch": 0.10107197549770292, "grad_norm": 5.308949567718103, "kl": 0.01043701171875, "learning_rate": 9.75005222673351e-07, "loss": 0.0, "reward": 3.4625000953674316, "reward_std": 0.5306740403175354, "rewards/accuracy_reward": 2.1624999046325684, "rewards/format_reward": 1.0, "step": 66, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 248.09375, "epoch": 0.10260336906584992, "grad_norm": 9.621365518129148, "kl": 0.01171875, "learning_rate": 9.742486853855444e-07, "loss": 0.0, "reward": 3.84375, "reward_std": 0.5204290151596069, "rewards/accuracy_reward": 2.5437498092651367, "rewards/format_reward": 1.0, "step": 67, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 247.0625, "epoch": 0.10413476263399694, "grad_norm": 15.544461775860299, "kl": 0.0135498046875, "learning_rate": 9.734811712236936e-07, "loss": 0.0, "reward": 3.549999952316284, "reward_std": 0.57508784532547, "rewards/accuracy_reward": 2.25, "rewards/format_reward": 1.0, "step": 68, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 282.90625, "epoch": 0.10566615620214395, "grad_norm": 6.616923976719133, "kl": 0.01116943359375, "learning_rate": 9.727026979525419e-07, "loss": 0.0, "reward": 2.893749952316284, "reward_std": 0.30293795466423035, "rewards/accuracy_reward": 1.5937498807907104, "rewards/format_reward": 1.0, "step": 69, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 253.1875, "epoch": 0.10719754977029096, "grad_norm": 5.12310463136158, "kl": 0.0162353515625, "learning_rate": 9.719132835904906e-07, "loss": 0.0, "reward": 3.28125, "reward_std": 0.566928505897522, "rewards/accuracy_reward": 1.9812498092651367, "rewards/format_reward": 1.0, "step": 70, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 270.625, "epoch": 0.10872894333843798, "grad_norm": 14.726605569458583, "kl": 0.01287841796875, "learning_rate": 9.711129464091814e-07, "loss": 0.0, "reward": 3.7312498092651367, "reward_std": 0.5318872928619385, "rewards/accuracy_reward": 2.4312500953674316, "rewards/format_reward": 1.0, "step": 71, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 293.78125, "epoch": 0.11026033690658499, "grad_norm": 4.47517871239952, "kl": 0.0120849609375, "learning_rate": 9.703017049330734e-07, "loss": 0.0, "reward": 2.768749952316284, "reward_std": 0.4168527126312256, "rewards/accuracy_reward": 1.46875, "rewards/format_reward": 1.0, "step": 72, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 234.3125, "epoch": 0.11179173047473201, "grad_norm": 7.696156798679741, "kl": 0.0164794921875, "learning_rate": 9.694795779390145e-07, "loss": 0.0, "reward": 3.0, "reward_std": 0.35017162561416626, "rewards/accuracy_reward": 1.7000000476837158, "rewards/format_reward": 1.0, "step": 73, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 285.28125, "epoch": 0.11332312404287902, "grad_norm": 27.537434750426346, "kl": 0.01446533203125, "learning_rate": 9.686465844558072e-07, "loss": 0.0, "reward": 3.125, "reward_std": 0.4312995672225952, "rewards/accuracy_reward": 1.8249999284744263, "rewards/format_reward": 1.0, "step": 74, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 285.96875, "epoch": 0.11485451761102604, "grad_norm": 5.471264754308217, "kl": 0.03466796875, "learning_rate": 9.678027437637677e-07, "loss": 0.0, "reward": 2.893749713897705, "reward_std": 0.3074049949645996, "rewards/accuracy_reward": 1.59375, "rewards/format_reward": 1.0, "step": 75, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 260.46875, "epoch": 0.11638591117917305, "grad_norm": 11.665474325673282, "kl": 0.018310546875, "learning_rate": 9.669480753942792e-07, "loss": 0.0, "reward": 4.068750381469727, "reward_std": 0.4616953730583191, "rewards/accuracy_reward": 2.7687501907348633, "rewards/format_reward": 1.0, "step": 76, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 278.4375, "epoch": 0.11791730474732007, "grad_norm": 6.153184210645685, "kl": 0.01348876953125, "learning_rate": 9.66082599129341e-07, "loss": 0.0, "reward": 3.28125, "reward_std": 0.4848785996437073, "rewards/accuracy_reward": 1.9812500476837158, "rewards/format_reward": 1.0, "step": 77, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 294.46875, "epoch": 0.11944869831546708, "grad_norm": 5.2430639141452415, "kl": 0.01300048828125, "learning_rate": 9.652063350011093e-07, "loss": 0.0, "reward": 2.9625000953674316, "reward_std": 0.3452790677547455, "rewards/accuracy_reward": 1.6625001430511475, "rewards/format_reward": 1.0, "step": 78, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 291.8125, "epoch": 0.12098009188361408, "grad_norm": 119.37951094933942, "kl": 0.01708984375, "learning_rate": 9.643193032914353e-07, "loss": 0.0, "reward": 3.1624999046325684, "reward_std": 0.44167351722717285, "rewards/accuracy_reward": 1.8624999523162842, "rewards/format_reward": 1.0, "step": 79, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 273.53125, "epoch": 0.1225114854517611, "grad_norm": 44.72540517746883, "kl": 0.0125732421875, "learning_rate": 9.634215245313939e-07, "loss": 0.0, "reward": 3.325000047683716, "reward_std": 0.4470377266407013, "rewards/accuracy_reward": 2.0250000953674316, "rewards/format_reward": 1.0, "step": 80, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 302.15625, "epoch": 0.12404287901990811, "grad_norm": 8.886709452383517, "kl": 0.017333984375, "learning_rate": 9.62513019500809e-07, "loss": 0.0, "reward": 3.3062498569488525, "reward_std": 0.4055883288383484, "rewards/accuracy_reward": 2.0062501430511475, "rewards/format_reward": 1.0, "step": 81, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 273.40625, "epoch": 0.12557427258805512, "grad_norm": 7.433627344353906, "kl": 0.01422119140625, "learning_rate": 9.615938092277739e-07, "loss": 0.0, "reward": 3.731250047683716, "reward_std": 0.6632749438285828, "rewards/accuracy_reward": 2.5062499046325684, "rewards/format_reward": 1.0, "step": 82, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 295.53125, "epoch": 0.12710566615620214, "grad_norm": 5.928349284260465, "kl": 0.01544189453125, "learning_rate": 9.606639149881621e-07, "loss": 0.0, "reward": 3.174999952316284, "reward_std": 0.4570466876029968, "rewards/accuracy_reward": 1.9500000476837158, "rewards/format_reward": 1.0, "step": 83, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 264.09375, "epoch": 0.12863705972434916, "grad_norm": 24.753344046181738, "kl": 0.0159912109375, "learning_rate": 9.597233583051376e-07, "loss": 0.0, "reward": 3.0749998092651367, "reward_std": 0.4688183665275574, "rewards/accuracy_reward": 1.850000023841858, "rewards/format_reward": 1.0, "step": 84, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 230.8125, "epoch": 0.13016845329249618, "grad_norm": 16.606939449922436, "kl": 0.0205078125, "learning_rate": 9.587721609486543e-07, "loss": 0.0, "reward": 3.59375, "reward_std": 0.5050134658813477, "rewards/accuracy_reward": 2.2937498092651367, "rewards/format_reward": 1.0, "step": 85, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 276.125, "epoch": 0.13169984686064318, "grad_norm": 6.494004286855659, "kl": 0.0130615234375, "learning_rate": 9.57810344934954e-07, "loss": 0.0, "reward": 3.40625, "reward_std": 0.4114701449871063, "rewards/accuracy_reward": 2.106250047683716, "rewards/format_reward": 1.0, "step": 86, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 283.84375, "epoch": 0.1332312404287902, "grad_norm": 9.009922489811581, "kl": 0.0123291015625, "learning_rate": 9.568379325260556e-07, "loss": 0.0, "reward": 3.5874998569488525, "reward_std": 0.48859333992004395, "rewards/accuracy_reward": 2.2875001430511475, "rewards/format_reward": 1.0, "step": 87, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 265.0, "epoch": 0.13476263399693722, "grad_norm": 9.739186172548235, "kl": 0.0159912109375, "learning_rate": 9.558549462292402e-07, "loss": 0.0, "reward": 3.5812501907348633, "reward_std": 0.47442397475242615, "rewards/accuracy_reward": 2.28125, "rewards/format_reward": 1.0, "step": 88, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 262.96875, "epoch": 0.1362940275650842, "grad_norm": 7.990348369284193, "kl": 0.0179443359375, "learning_rate": 9.548614087965304e-07, "loss": 0.0, "reward": 3.5249998569488525, "reward_std": 0.5649663209915161, "rewards/accuracy_reward": 2.299999952316284, "rewards/format_reward": 1.0, "step": 89, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 284.6875, "epoch": 0.13782542113323124, "grad_norm": 19.39633992166886, "kl": 0.01519775390625, "learning_rate": 9.538573432241637e-07, "loss": 0.0, "reward": 2.956249952316284, "reward_std": 0.3850908875465393, "rewards/accuracy_reward": 1.65625, "rewards/format_reward": 1.0, "step": 90, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 295.28125, "epoch": 0.13935681470137826, "grad_norm": 7.662501804425234, "kl": 0.01470947265625, "learning_rate": 9.528427727520591e-07, "loss": 0.0, "reward": 3.4937498569488525, "reward_std": 0.5627257227897644, "rewards/accuracy_reward": 2.1937501430511475, "rewards/format_reward": 1.0, "step": 91, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 237.25, "epoch": 0.14088820826952528, "grad_norm": 8.613477189912425, "kl": 0.020263671875, "learning_rate": 9.518177208632812e-07, "loss": 0.0, "reward": 3.6999998092651367, "reward_std": 0.6943286657333374, "rewards/accuracy_reward": 2.3999998569488525, "rewards/format_reward": 1.0, "step": 92, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 266.53125, "epoch": 0.14241960183767227, "grad_norm": 5.553773740185143, "kl": 0.0169677734375, "learning_rate": 9.507822112834946e-07, "loss": 0.0, "reward": 2.90625, "reward_std": 0.40452033281326294, "rewards/accuracy_reward": 1.681249976158142, "rewards/format_reward": 1.0, "step": 93, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 286.71875, "epoch": 0.1439509954058193, "grad_norm": 4.150148471818182, "kl": 0.016357421875, "learning_rate": 9.497362679804168e-07, "loss": 0.0, "reward": 3.4937500953674316, "reward_std": 0.4619887173175812, "rewards/accuracy_reward": 2.1937501430511475, "rewards/format_reward": 1.0, "step": 94, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 255.875, "epoch": 0.14548238897396631, "grad_norm": 32.823331085292175, "kl": 0.017578125, "learning_rate": 9.486799151632612e-07, "loss": 0.0, "reward": 3.5562498569488525, "reward_std": 0.8575383424758911, "rewards/accuracy_reward": 2.3312501907348633, "rewards/format_reward": 1.0, "step": 95, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 303.8125, "epoch": 0.14701378254211334, "grad_norm": 3.9194088008606944, "kl": 0.0166015625, "learning_rate": 9.47613177282179e-07, "loss": 0.0, "reward": 3.8000001907348633, "reward_std": 0.5585123300552368, "rewards/accuracy_reward": 2.5, "rewards/format_reward": 1.0, "step": 96, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 274.09375, "epoch": 0.14854517611026033, "grad_norm": 14.355949768888078, "kl": 0.019287109375, "learning_rate": 9.465360790276911e-07, "loss": 0.0, "reward": 3.3999998569488525, "reward_std": 0.5401572585105896, "rewards/accuracy_reward": 2.0999999046325684, "rewards/format_reward": 1.0, "step": 97, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 273.28125, "epoch": 0.15007656967840735, "grad_norm": 102.92002021277533, "kl": 0.01806640625, "learning_rate": 9.454486453301189e-07, "loss": 0.0, "reward": 3.231250047683716, "reward_std": 0.5214991569519043, "rewards/accuracy_reward": 1.931249976158142, "rewards/format_reward": 1.0, "step": 98, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 262.78125, "epoch": 0.15160796324655437, "grad_norm": 4.0363473222679, "kl": 0.017578125, "learning_rate": 9.44350901359005e-07, "loss": 0.0, "reward": 3.3812499046325684, "reward_std": 0.5377869606018066, "rewards/accuracy_reward": 2.15625, "rewards/format_reward": 1.0, "step": 99, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 336.25, "epoch": 0.15313935681470137, "grad_norm": 5.493115558081385, "kl": 0.01470947265625, "learning_rate": 9.432428725225326e-07, "loss": 0.0, "reward": 2.8812499046325684, "reward_std": 0.5311998128890991, "rewards/accuracy_reward": 1.5812499523162842, "rewards/format_reward": 1.0, "step": 100, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 280.71875, "epoch": 0.1546707503828484, "grad_norm": 16.20136610259414, "kl": 0.020263671875, "learning_rate": 9.421245844669361e-07, "loss": 0.0, "reward": 3.03125, "reward_std": 0.361322820186615, "rewards/accuracy_reward": 1.7312500476837158, "rewards/format_reward": 1.0, "step": 101, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 247.65625, "epoch": 0.1562021439509954, "grad_norm": 12.026042915386354, "kl": 0.0234375, "learning_rate": 9.409960630759078e-07, "loss": 0.0, "reward": 3.75, "reward_std": 0.6079727411270142, "rewards/accuracy_reward": 2.450000047683716, "rewards/format_reward": 1.0, "step": 102, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 237.5625, "epoch": 0.15773353751914243, "grad_norm": 56.156360481293405, "kl": 0.021240234375, "learning_rate": 9.398573344699992e-07, "loss": 0.0, "reward": 3.706249713897705, "reward_std": 0.4153931140899658, "rewards/accuracy_reward": 2.40625, "rewards/format_reward": 1.0, "step": 103, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 240.21875, "epoch": 0.15926493108728942, "grad_norm": 9.033204445215723, "kl": 0.0238037109375, "learning_rate": 9.387084250060162e-07, "loss": 0.0, "reward": 3.581249952316284, "reward_std": 0.43426191806793213, "rewards/accuracy_reward": 2.3562498092651367, "rewards/format_reward": 1.0, "step": 104, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 258.65625, "epoch": 0.16079632465543645, "grad_norm": 9.214387456879582, "kl": 0.022705078125, "learning_rate": 9.375493612764085e-07, "loss": 0.0, "reward": 3.1312499046325684, "reward_std": 0.5742160081863403, "rewards/accuracy_reward": 1.8312499523162842, "rewards/format_reward": 1.0, "step": 105, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 274.09375, "epoch": 0.16232771822358347, "grad_norm": 6.527925149967669, "kl": 0.02392578125, "learning_rate": 9.363801701086554e-07, "loss": 0.0, "reward": 3.1937499046325684, "reward_std": 0.3148888349533081, "rewards/accuracy_reward": 1.8937500715255737, "rewards/format_reward": 1.0, "step": 106, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 286.9375, "epoch": 0.1638591117917305, "grad_norm": 9.874956521245124, "kl": 0.0213623046875, "learning_rate": 9.35200878564643e-07, "loss": 0.0, "reward": 3.081249952316284, "reward_std": 0.5441991090774536, "rewards/accuracy_reward": 1.8562500476837158, "rewards/format_reward": 1.0, "step": 107, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 243.1875, "epoch": 0.16539050535987748, "grad_norm": 30.178604262517098, "kl": 0.02294921875, "learning_rate": 9.340115139400399e-07, "loss": 0.0, "reward": 3.0562500953674316, "reward_std": 0.2912874221801758, "rewards/accuracy_reward": 1.756250023841858, "rewards/format_reward": 1.0, "step": 108, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 264.375, "epoch": 0.1669218989280245, "grad_norm": 11.310326494662677, "kl": 0.019775390625, "learning_rate": 9.32812103763664e-07, "loss": 0.0, "reward": 3.3687498569488525, "reward_std": 0.4435497522354126, "rewards/accuracy_reward": 2.0687499046325684, "rewards/format_reward": 1.0, "step": 109, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 247.0625, "epoch": 0.16845329249617153, "grad_norm": 4.252098043893597, "kl": 0.019287109375, "learning_rate": 9.316026757968454e-07, "loss": 0.0, "reward": 3.831249952316284, "reward_std": 0.5273396372795105, "rewards/accuracy_reward": 2.53125, "rewards/format_reward": 1.0, "step": 110, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 268.46875, "epoch": 0.16998468606431852, "grad_norm": 6.7966917684785395, "kl": 0.0205078125, "learning_rate": 9.303832580327844e-07, "loss": 0.0, "reward": 3.90625, "reward_std": 0.5165694952011108, "rewards/accuracy_reward": 2.606250047683716, "rewards/format_reward": 1.0, "step": 111, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 267.90625, "epoch": 0.17151607963246554, "grad_norm": 9.197438213715376, "kl": 0.019775390625, "learning_rate": 9.291538786959037e-07, "loss": 0.0, "reward": 3.1499998569488525, "reward_std": 0.4710281193256378, "rewards/accuracy_reward": 1.9250000715255737, "rewards/format_reward": 1.0, "step": 112, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 291.0, "epoch": 0.17304747320061256, "grad_norm": 9.499800643366168, "kl": 0.0203857421875, "learning_rate": 9.279145662411941e-07, "loss": 0.0, "reward": 2.8625001907348633, "reward_std": 0.39233559370040894, "rewards/accuracy_reward": 1.6375000476837158, "rewards/format_reward": 1.0, "step": 113, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 256.46875, "epoch": 0.17457886676875958, "grad_norm": 9.624750095041502, "kl": 0.0224609375, "learning_rate": 9.26665349353557e-07, "loss": 0.0, "reward": 3.843750238418579, "reward_std": 0.6045280694961548, "rewards/accuracy_reward": 2.6187500953674316, "rewards/format_reward": 1.0, "step": 114, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 351.21875, "epoch": 0.17611026033690658, "grad_norm": 4.098513181220432, "kl": 0.0166015625, "learning_rate": 9.2540625694714e-07, "loss": 0.0, "reward": 3.2125000953674316, "reward_std": 0.3509003221988678, "rewards/accuracy_reward": 1.9125001430511475, "rewards/format_reward": 1.0, "step": 115, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 261.84375, "epoch": 0.1776416539050536, "grad_norm": 5.603893898192362, "kl": 0.02197265625, "learning_rate": 9.241373181646671e-07, "loss": 0.0, "reward": 2.9312498569488525, "reward_std": 0.3804709315299988, "rewards/accuracy_reward": 1.631250023841858, "rewards/format_reward": 1.0, "step": 116, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 314.75, "epoch": 0.17917304747320062, "grad_norm": 11.740130042508287, "kl": 0.0185546875, "learning_rate": 9.228585623767658e-07, "loss": 0.0, "reward": 2.8812499046325684, "reward_std": 0.4420754909515381, "rewards/accuracy_reward": 1.5812499523162842, "rewards/format_reward": 1.0, "step": 117, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 275.53125, "epoch": 0.1807044410413476, "grad_norm": 4.922266982609114, "kl": 0.021240234375, "learning_rate": 9.21570019181285e-07, "loss": 0.0, "reward": 3.624999761581421, "reward_std": 0.5486506819725037, "rewards/accuracy_reward": 2.3249998092651367, "rewards/format_reward": 1.0, "step": 118, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 272.8125, "epoch": 0.18223583460949463, "grad_norm": 4.493970548638196, "kl": 0.0233154296875, "learning_rate": 9.202717184026123e-07, "loss": 0.0, "reward": 3.7437500953674316, "reward_std": 0.3937029242515564, "rewards/accuracy_reward": 2.4437499046325684, "rewards/format_reward": 1.0, "step": 119, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 286.40625, "epoch": 0.18376722817764166, "grad_norm": 4.869676485468556, "kl": 0.019287109375, "learning_rate": 9.189636900909817e-07, "loss": 0.0, "reward": 3.9000000953674316, "reward_std": 0.49319151043891907, "rewards/accuracy_reward": 2.5999999046325684, "rewards/format_reward": 1.0, "step": 120, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.625, "epoch": 0.18529862174578868, "grad_norm": 6.776203629581503, "kl": 0.01708984375, "learning_rate": 9.176459645217794e-07, "loss": 0.0, "reward": 3.0999999046325684, "reward_std": 0.3274829685688019, "rewards/accuracy_reward": 1.7999999523162842, "rewards/format_reward": 1.0, "step": 121, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 288.625, "epoch": 0.18683001531393567, "grad_norm": 5.656254719786936, "kl": 0.0234375, "learning_rate": 9.163185721948421e-07, "loss": 0.0, "reward": 3.1187498569488525, "reward_std": 0.35278022289276123, "rewards/accuracy_reward": 1.818750023841858, "rewards/format_reward": 1.0, "step": 122, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 253.0, "epoch": 0.1883614088820827, "grad_norm": 4.6916404290395075, "kl": 0.0240478515625, "learning_rate": 9.14981543833752e-07, "loss": 0.0, "reward": 4.212500095367432, "reward_std": 0.45350727438926697, "rewards/accuracy_reward": 2.9124999046325684, "rewards/format_reward": 1.0, "step": 123, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 289.3125, "epoch": 0.18989280245022971, "grad_norm": 14.783330097461596, "kl": 0.020263671875, "learning_rate": 9.136349103851252e-07, "loss": 0.0, "reward": 2.8999998569488525, "reward_std": 0.4208742380142212, "rewards/accuracy_reward": 1.600000023841858, "rewards/format_reward": 1.0, "step": 124, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 316.3125, "epoch": 0.19142419601837674, "grad_norm": 3.294759506674053, "kl": 0.0238037109375, "learning_rate": 9.122787030178949e-07, "loss": 0.0, "reward": 3.4937500953674316, "reward_std": 0.5771903991699219, "rewards/accuracy_reward": 2.1937499046325684, "rewards/format_reward": 1.0, "step": 125, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 270.0, "epoch": 0.19295558958652373, "grad_norm": 5.780643239892506, "kl": 0.022705078125, "learning_rate": 9.10912953122591e-07, "loss": 0.0, "reward": 3.4499998092651367, "reward_std": 0.5633983016014099, "rewards/accuracy_reward": 2.1500000953674316, "rewards/format_reward": 1.0, "step": 126, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 324.75, "epoch": 0.19448698315467075, "grad_norm": 4.234196031996337, "kl": 0.0208740234375, "learning_rate": 9.095376923106129e-07, "loss": 0.0, "reward": 3.0562500953674316, "reward_std": 0.3289462924003601, "rewards/accuracy_reward": 1.756250023841858, "rewards/format_reward": 1.0, "step": 127, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 270.90625, "epoch": 0.19601837672281777, "grad_norm": 11.190260547131228, "kl": 0.0262451171875, "learning_rate": 9.081529524134975e-07, "loss": 0.0, "reward": 3.4937500953674316, "reward_std": 0.47610414028167725, "rewards/accuracy_reward": 2.1937499046325684, "rewards/format_reward": 1.0, "step": 128, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 288.0, "epoch": 0.19754977029096477, "grad_norm": 6.360658284575685, "kl": 0.0240478515625, "learning_rate": 9.067587654821837e-07, "loss": 0.0, "reward": 3.043750047683716, "reward_std": 0.4341242015361786, "rewards/accuracy_reward": 1.743749976158142, "rewards/format_reward": 1.0, "step": 129, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 296.5625, "epoch": 0.1990811638591118, "grad_norm": 4.296191720894978, "kl": 0.020263671875, "learning_rate": 9.053551637862692e-07, "loss": 0.0, "reward": 3.4124999046325684, "reward_std": 0.5241235494613647, "rewards/accuracy_reward": 2.1125001907348633, "rewards/format_reward": 1.0, "step": 130, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 303.53125, "epoch": 0.2006125574272588, "grad_norm": 8.514387720273588, "kl": 0.0238037109375, "learning_rate": 9.03942179813264e-07, "loss": 0.0, "reward": 3.393749713897705, "reward_std": 0.42237094044685364, "rewards/accuracy_reward": 2.09375, "rewards/format_reward": 1.0, "step": 131, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 303.21875, "epoch": 0.20214395099540583, "grad_norm": 41.89008793742567, "kl": 0.0228271484375, "learning_rate": 9.025198462678392e-07, "loss": 0.0, "reward": 3.0437498092651367, "reward_std": 0.44920703768730164, "rewards/accuracy_reward": 1.743749976158142, "rewards/format_reward": 1.0, "step": 132, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 300.75, "epoch": 0.20367534456355282, "grad_norm": 7.194974348985283, "kl": 0.0244140625, "learning_rate": 9.010881960710688e-07, "loss": 0.0, "reward": 3.4437499046325684, "reward_std": 0.43165701627731323, "rewards/accuracy_reward": 2.21875, "rewards/format_reward": 1.0, "step": 133, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 283.5, "epoch": 0.20520673813169985, "grad_norm": 10.06890479049601, "kl": 0.0252685546875, "learning_rate": 8.996472623596687e-07, "loss": 0.0, "reward": 3.6125001907348633, "reward_std": 0.7025853395462036, "rewards/accuracy_reward": 2.3874998092651367, "rewards/format_reward": 1.0, "step": 134, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 296.3125, "epoch": 0.20673813169984687, "grad_norm": 12.37502990164126, "kl": 0.0247802734375, "learning_rate": 8.98197078485229e-07, "loss": 0.0, "reward": 3.6374998092651367, "reward_std": 0.5983169078826904, "rewards/accuracy_reward": 2.4125001430511475, "rewards/format_reward": 1.0, "step": 135, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 323.75, "epoch": 0.2082695252679939, "grad_norm": 5.657135150692953, "kl": 0.0257568359375, "learning_rate": 8.967376780134426e-07, "loss": 0.0, "reward": 2.8312501907348633, "reward_std": 0.35583776235580444, "rewards/accuracy_reward": 1.53125, "rewards/format_reward": 1.0, "step": 136, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 281.625, "epoch": 0.20980091883614088, "grad_norm": 4.380349757952363, "kl": 0.029541015625, "learning_rate": 8.952690947233284e-07, "loss": 0.0, "reward": 3.4437501430511475, "reward_std": 0.32703521847724915, "rewards/accuracy_reward": 2.21875, "rewards/format_reward": 1.0, "step": 137, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 312.25, "epoch": 0.2113323124042879, "grad_norm": 8.823992736109716, "kl": 0.02587890625, "learning_rate": 8.937913626064486e-07, "loss": 0.0, "reward": 3.2937498092651367, "reward_std": 0.3964410424232483, "rewards/accuracy_reward": 2.0687499046325684, "rewards/format_reward": 1.0, "step": 138, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 308.3125, "epoch": 0.21286370597243492, "grad_norm": 12.102631268395426, "kl": 0.0272216796875, "learning_rate": 8.923045158661226e-07, "loss": 0.0, "reward": 3.6312499046325684, "reward_std": 0.4014958143234253, "rewards/accuracy_reward": 2.331249952316284, "rewards/format_reward": 1.0, "step": 139, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 271.65625, "epoch": 0.21439509954058192, "grad_norm": 7.913428989372605, "kl": 0.0242919921875, "learning_rate": 8.908085889166357e-07, "loss": 0.0, "reward": 4.025000095367432, "reward_std": 0.4050452709197998, "rewards/accuracy_reward": 2.7250001430511475, "rewards/format_reward": 1.0, "step": 140, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 284.5, "epoch": 0.21592649310872894, "grad_norm": 4.051622799305994, "kl": 0.031494140625, "learning_rate": 8.893036163824414e-07, "loss": 0.0, "reward": 3.5250000953674316, "reward_std": 0.2405874878168106, "rewards/accuracy_reward": 2.2249999046325684, "rewards/format_reward": 1.0, "step": 141, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 282.21875, "epoch": 0.21745788667687596, "grad_norm": 15.525922723951407, "kl": 0.029052734375, "learning_rate": 8.877896330973611e-07, "loss": 0.0, "reward": 3.7874999046325684, "reward_std": 0.6023781299591064, "rewards/accuracy_reward": 2.562500238418579, "rewards/format_reward": 1.0, "step": 142, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 290.625, "epoch": 0.21898928024502298, "grad_norm": 11.53714498993186, "kl": 0.025390625, "learning_rate": 8.862666741037772e-07, "loss": 0.0, "reward": 3.2562499046325684, "reward_std": 0.3409149646759033, "rewards/accuracy_reward": 1.9562500715255737, "rewards/format_reward": 1.0, "step": 143, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 321.5, "epoch": 0.22052067381316998, "grad_norm": 6.5118421682140655, "kl": 0.0262451171875, "learning_rate": 8.847347746518226e-07, "loss": 0.0, "reward": 3.53125, "reward_std": 0.5107113122940063, "rewards/accuracy_reward": 2.2312498092651367, "rewards/format_reward": 1.0, "step": 144, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 301.53125, "epoch": 0.222052067381317, "grad_norm": 9.837705570386271, "kl": 0.03076171875, "learning_rate": 8.831939701985636e-07, "loss": 0.0, "reward": 3.5249998569488525, "reward_std": 0.5034339427947998, "rewards/accuracy_reward": 2.2249999046325684, "rewards/format_reward": 1.0, "step": 145, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 310.84375, "epoch": 0.22358346094946402, "grad_norm": 5.862107982116596, "kl": 0.03173828125, "learning_rate": 8.81644296407181e-07, "loss": 0.0, "reward": 3.0875000953674316, "reward_std": 0.3950956463813782, "rewards/accuracy_reward": 1.787500023841858, "rewards/format_reward": 1.0, "step": 146, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 345.40625, "epoch": 0.225114854517611, "grad_norm": 8.708867217195756, "kl": 0.028076171875, "learning_rate": 8.800857891461433e-07, "loss": 0.0, "reward": 3.0999999046325684, "reward_std": 0.3657301366329193, "rewards/accuracy_reward": 1.7999999523162842, "rewards/format_reward": 1.0, "step": 147, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 400.0, "epoch": 0.22664624808575803, "grad_norm": 6.510285798481118, "kl": 0.022216796875, "learning_rate": 8.785184844883766e-07, "loss": 0.0, "reward": 3.487499952316284, "reward_std": 0.3941337466239929, "rewards/accuracy_reward": 2.1875, "rewards/format_reward": 1.0, "step": 148, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 264.15625, "epoch": 0.22817764165390506, "grad_norm": 6.298316940368753, "kl": 0.031494140625, "learning_rate": 8.769424187104302e-07, "loss": 0.0, "reward": 3.8187499046325684, "reward_std": 0.49747228622436523, "rewards/accuracy_reward": 2.518749713897705, "rewards/format_reward": 1.0, "step": 149, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 327.9375, "epoch": 0.22970903522205208, "grad_norm": 80.96139230971517, "kl": 0.02978515625, "learning_rate": 8.75357628291637e-07, "loss": 0.0, "reward": 3.637500047683716, "reward_std": 0.42090892791748047, "rewards/accuracy_reward": 2.3375000953674316, "rewards/format_reward": 1.0, "step": 150, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 294.09375, "epoch": 0.23124042879019907, "grad_norm": 5.201330390482463, "kl": 0.0311279296875, "learning_rate": 8.737641499132681e-07, "loss": 0.0, "reward": 2.9437499046325684, "reward_std": 0.14110496640205383, "rewards/accuracy_reward": 1.6437499523162842, "rewards/format_reward": 1.0, "step": 151, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 291.09375, "epoch": 0.2327718223583461, "grad_norm": 6.115410440044349, "kl": 0.0281982421875, "learning_rate": 8.721620204576856e-07, "loss": 0.0, "reward": 3.1937499046325684, "reward_std": 0.3890814781188965, "rewards/accuracy_reward": 1.96875, "rewards/format_reward": 1.0, "step": 152, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 320.3125, "epoch": 0.2343032159264931, "grad_norm": 5.4394369395935565, "kl": 0.032470703125, "learning_rate": 8.705512770074868e-07, "loss": 0.0, "reward": 3.4375, "reward_std": 0.4925556480884552, "rewards/accuracy_reward": 2.1374998092651367, "rewards/format_reward": 1.0, "step": 153, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 293.59375, "epoch": 0.23583460949464014, "grad_norm": 17.260831202111817, "kl": 0.028564453125, "learning_rate": 8.689319568446474e-07, "loss": 0.0, "reward": 3.5124998092651367, "reward_std": 0.4656377136707306, "rewards/accuracy_reward": 2.2874999046325684, "rewards/format_reward": 1.0, "step": 154, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 281.90625, "epoch": 0.23736600306278713, "grad_norm": 3.8749629849633522, "kl": 0.033203125, "learning_rate": 8.673040974496584e-07, "loss": 0.0, "reward": 3.6999998092651367, "reward_std": 0.3916766345500946, "rewards/accuracy_reward": 2.4000000953674316, "rewards/format_reward": 1.0, "step": 155, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 282.59375, "epoch": 0.23889739663093415, "grad_norm": 139.35810615334432, "kl": 0.0322265625, "learning_rate": 8.656677365006579e-07, "loss": 0.0, "reward": 3.768749952316284, "reward_std": 0.4740249514579773, "rewards/accuracy_reward": 2.46875, "rewards/format_reward": 1.0, "step": 156, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 244.125, "epoch": 0.24042879019908117, "grad_norm": 10.567534138609691, "kl": 0.03125, "learning_rate": 8.640229118725595e-07, "loss": 0.0, "reward": 3.8187499046325684, "reward_std": 0.6347061395645142, "rewards/accuracy_reward": 2.5187501907348633, "rewards/format_reward": 1.0, "step": 157, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 300.125, "epoch": 0.24196018376722817, "grad_norm": 8.106437182792291, "kl": 0.029541015625, "learning_rate": 8.62369661636176e-07, "loss": 0.0, "reward": 2.75, "reward_std": 0.27621468901634216, "rewards/accuracy_reward": 1.600000023841858, "rewards/format_reward": 1.0, "step": 158, "temporal_rewards": 0.5 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 298.53125, "epoch": 0.2434915773353752, "grad_norm": 4.066291402520282, "kl": 0.023681640625, "learning_rate": 8.607080240573372e-07, "loss": 0.0, "reward": 4.181249618530273, "reward_std": 0.4398733973503113, "rewards/accuracy_reward": 2.8812501430511475, "rewards/format_reward": 1.0, "step": 159, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 264.6875, "epoch": 0.2450229709035222, "grad_norm": 9.494494792809737, "kl": 0.03173828125, "learning_rate": 8.590380375960053e-07, "loss": 0.0, "reward": 3.8125, "reward_std": 0.47410985827445984, "rewards/accuracy_reward": 2.5124998092651367, "rewards/format_reward": 1.0, "step": 160, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 336.1875, "epoch": 0.24655436447166923, "grad_norm": 5.777129184291776, "kl": 0.0279541015625, "learning_rate": 8.573597409053837e-07, "loss": 0.0, "reward": 3.28125, "reward_std": 0.570157527923584, "rewards/accuracy_reward": 1.9812499284744263, "rewards/format_reward": 1.0, "step": 161, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 313.21875, "epoch": 0.24808575803981622, "grad_norm": 13.152237866980391, "kl": 0.02734375, "learning_rate": 8.556731728310234e-07, "loss": 0.0, "reward": 3.3499999046325684, "reward_std": 0.29259437322616577, "rewards/accuracy_reward": 2.049999952316284, "rewards/format_reward": 1.0, "step": 162, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 293.625, "epoch": 0.24961715160796324, "grad_norm": 10.763469615081304, "kl": 0.031005859375, "learning_rate": 8.53978372409923e-07, "loss": 0.0, "reward": 3.34375, "reward_std": 0.37417662143707275, "rewards/accuracy_reward": 2.0437498092651367, "rewards/format_reward": 1.0, "step": 163, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 283.375, "epoch": 0.25114854517611024, "grad_norm": 5.323834123530134, "kl": 0.0283203125, "learning_rate": 8.522753788696258e-07, "loss": 0.0, "reward": 3.4937500953674316, "reward_std": 0.5279685854911804, "rewards/accuracy_reward": 2.268749952316284, "rewards/format_reward": 1.0, "step": 164, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 321.75, "epoch": 0.25267993874425726, "grad_norm": 6.031806927830471, "kl": 0.027099609375, "learning_rate": 8.505642316273111e-07, "loss": 0.0, "reward": 3.0625, "reward_std": 0.4299696683883667, "rewards/accuracy_reward": 1.8375000953674316, "rewards/format_reward": 1.0, "step": 165, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 286.40625, "epoch": 0.2542113323124043, "grad_norm": 11.240961051487723, "kl": 0.0291748046875, "learning_rate": 8.488449702888827e-07, "loss": 0.0, "reward": 3.2125000953674316, "reward_std": 0.40743768215179443, "rewards/accuracy_reward": 1.912500023841858, "rewards/format_reward": 1.0, "step": 166, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 258.0625, "epoch": 0.2557427258805513, "grad_norm": 9.195739213624087, "kl": 0.03271484375, "learning_rate": 8.471176346480517e-07, "loss": 0.0, "reward": 3.893749713897705, "reward_std": 0.4122272729873657, "rewards/accuracy_reward": 2.59375, "rewards/format_reward": 1.0, "step": 167, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 318.25, "epoch": 0.2572741194486983, "grad_norm": 8.215813439305336, "kl": 0.02978515625, "learning_rate": 8.453822646854154e-07, "loss": 0.0, "reward": 3.856250047683716, "reward_std": 0.3475147485733032, "rewards/accuracy_reward": 2.5562500953674316, "rewards/format_reward": 1.0, "step": 168, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 295.34375, "epoch": 0.25880551301684535, "grad_norm": 5.114773777833558, "kl": 0.030517578125, "learning_rate": 8.436389005675324e-07, "loss": 0.0, "reward": 3.737499952316284, "reward_std": 0.6522217988967896, "rewards/accuracy_reward": 2.4375, "rewards/format_reward": 1.0, "step": 169, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 288.25, "epoch": 0.26033690658499237, "grad_norm": 7.109534980092122, "kl": 0.03515625, "learning_rate": 8.418875826459919e-07, "loss": 0.0, "reward": 3.9000000953674316, "reward_std": 0.40476852655410767, "rewards/accuracy_reward": 2.5999999046325684, "rewards/format_reward": 1.0, "step": 170, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 307.28125, "epoch": 0.26186830015313933, "grad_norm": 4.142263790273646, "kl": 0.033935546875, "learning_rate": 8.401283514564815e-07, "loss": 0.0, "reward": 3.5562498569488525, "reward_std": 0.41218000650405884, "rewards/accuracy_reward": 2.2562499046325684, "rewards/format_reward": 1.0, "step": 171, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 348.0625, "epoch": 0.26339969372128635, "grad_norm": 9.922744313017875, "kl": 0.0281982421875, "learning_rate": 8.383612477178464e-07, "loss": 0.0, "reward": 2.8625001907348633, "reward_std": 0.5244588851928711, "rewards/accuracy_reward": 1.6375000476837158, "rewards/format_reward": 1.0, "step": 172, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 287.40625, "epoch": 0.2649310872894334, "grad_norm": 4.091456334995777, "kl": 0.0341796875, "learning_rate": 8.365863123311497e-07, "loss": 0.0, "reward": 2.6312499046325684, "reward_std": 0.1589444875717163, "rewards/accuracy_reward": 1.40625, "rewards/format_reward": 1.0, "step": 173, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 283.09375, "epoch": 0.2664624808575804, "grad_norm": 4.940813443369881, "kl": 0.033203125, "learning_rate": 8.348035863787237e-07, "loss": 0.0, "reward": 3.46875, "reward_std": 0.2943500876426697, "rewards/accuracy_reward": 2.168750047683716, "rewards/format_reward": 1.0, "step": 174, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 257.03125, "epoch": 0.2679938744257274, "grad_norm": 6.586326309088319, "kl": 0.033203125, "learning_rate": 8.330131111232201e-07, "loss": 0.0, "reward": 4.03125, "reward_std": 0.6235748529434204, "rewards/accuracy_reward": 2.8062498569488525, "rewards/format_reward": 1.0, "step": 175, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 290.8125, "epoch": 0.26952526799387444, "grad_norm": 6.164855709585961, "kl": 0.03564453125, "learning_rate": 8.312149280066542e-07, "loss": 0.0, "reward": 3.612499713897705, "reward_std": 0.36475399136543274, "rewards/accuracy_reward": 2.3125, "rewards/format_reward": 1.0, "step": 176, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 297.90625, "epoch": 0.27105666156202146, "grad_norm": 12.646560626155859, "kl": 0.0306396484375, "learning_rate": 8.294090786494463e-07, "loss": 0.0, "reward": 3.674999952316284, "reward_std": 0.4887303411960602, "rewards/accuracy_reward": 2.375, "rewards/format_reward": 1.0, "step": 177, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 305.96875, "epoch": 0.2725880551301684, "grad_norm": 10.184859910871095, "kl": 0.031494140625, "learning_rate": 8.275956048494579e-07, "loss": 0.0, "reward": 3.518749952316284, "reward_std": 0.5078155398368835, "rewards/accuracy_reward": 2.293750047683716, "rewards/format_reward": 1.0, "step": 178, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 290.0, "epoch": 0.27411944869831545, "grad_norm": 15.727667383001375, "kl": 0.031005859375, "learning_rate": 8.257745485810249e-07, "loss": 0.0, "reward": 3.875, "reward_std": 0.5017106533050537, "rewards/accuracy_reward": 2.575000047683716, "rewards/format_reward": 1.0, "step": 179, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 307.75, "epoch": 0.27565084226646247, "grad_norm": 3.1981726512133464, "kl": 0.03564453125, "learning_rate": 8.239459519939851e-07, "loss": 0.0, "reward": 3.28125, "reward_std": 0.3276711106300354, "rewards/accuracy_reward": 2.0562500953674316, "rewards/format_reward": 1.0, "step": 180, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 322.71875, "epoch": 0.2771822358346095, "grad_norm": 13.088209195159147, "kl": 0.0306396484375, "learning_rate": 8.221098574127035e-07, "loss": 0.0, "reward": 3.8062500953674316, "reward_std": 0.7018867135047913, "rewards/accuracy_reward": 2.5812501907348633, "rewards/format_reward": 1.0, "step": 181, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 327.125, "epoch": 0.2787136294027565, "grad_norm": 5.994825764520661, "kl": 0.032958984375, "learning_rate": 8.202663073350921e-07, "loss": 0.0, "reward": 3.4124999046325684, "reward_std": 0.49544626474380493, "rewards/accuracy_reward": 2.112499952316284, "rewards/format_reward": 1.0, "step": 182, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 298.78125, "epoch": 0.28024502297090353, "grad_norm": 6.843693897252769, "kl": 0.034912109375, "learning_rate": 8.184153444316269e-07, "loss": 0.0, "reward": 3.59375, "reward_std": 0.4298456907272339, "rewards/accuracy_reward": 2.293750047683716, "rewards/format_reward": 1.0, "step": 183, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 359.34375, "epoch": 0.28177641653905056, "grad_norm": 5.53671809463008, "kl": 0.02978515625, "learning_rate": 8.165570115443592e-07, "loss": 0.0, "reward": 3.5062499046325684, "reward_std": 0.4097582697868347, "rewards/accuracy_reward": 2.206249952316284, "rewards/format_reward": 1.0, "step": 184, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 319.4375, "epoch": 0.2833078101071976, "grad_norm": 5.028956492644307, "kl": 0.030029296875, "learning_rate": 8.14691351685925e-07, "loss": 0.0, "reward": 2.981250047683716, "reward_std": 0.20719552040100098, "rewards/accuracy_reward": 1.681249976158142, "rewards/format_reward": 1.0, "step": 185, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 332.28125, "epoch": 0.28483920367534454, "grad_norm": 14.870145066767432, "kl": 0.03076171875, "learning_rate": 8.12818408038549e-07, "loss": 0.0, "reward": 2.887500047683716, "reward_std": 0.3261798322200775, "rewards/accuracy_reward": 1.662500023841858, "rewards/format_reward": 1.0, "step": 186, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 241.71875, "epoch": 0.28637059724349156, "grad_norm": 7.261751728440713, "kl": 0.03369140625, "learning_rate": 8.109382239530451e-07, "loss": 0.0, "reward": 3.5500001907348633, "reward_std": 0.6096799373626709, "rewards/accuracy_reward": 2.25, "rewards/format_reward": 1.0, "step": 187, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 255.0, "epoch": 0.2879019908116386, "grad_norm": 9.417177086278453, "kl": 0.037841796875, "learning_rate": 8.090508429478129e-07, "loss": 0.0, "reward": 3.612499952316284, "reward_std": 0.589754581451416, "rewards/accuracy_reward": 2.3125, "rewards/format_reward": 1.0, "step": 188, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 326.84375, "epoch": 0.2894333843797856, "grad_norm": 12.28812335790627, "kl": 0.032958984375, "learning_rate": 8.07156308707831e-07, "loss": 0.0, "reward": 3.4625000953674316, "reward_std": 0.4379882514476776, "rewards/accuracy_reward": 2.1624999046325684, "rewards/format_reward": 1.0, "step": 189, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 296.5, "epoch": 0.29096477794793263, "grad_norm": 8.95424949929567, "kl": 0.029296875, "learning_rate": 8.052546650836453e-07, "loss": 0.0, "reward": 3.987499952316284, "reward_std": 0.4414653480052948, "rewards/accuracy_reward": 2.6875, "rewards/format_reward": 1.0, "step": 190, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 289.1875, "epoch": 0.29249617151607965, "grad_norm": 16.95609593725675, "kl": 0.034423828125, "learning_rate": 8.033459560903539e-07, "loss": 0.0, "reward": 3.3812499046325684, "reward_std": 0.32076090574264526, "rewards/accuracy_reward": 2.081249952316284, "rewards/format_reward": 1.0, "step": 191, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 289.15625, "epoch": 0.29402756508422667, "grad_norm": 7.171670881715125, "kl": 0.03076171875, "learning_rate": 8.014302259065892e-07, "loss": 0.0, "reward": 3.325000047683716, "reward_std": 0.5578684210777283, "rewards/accuracy_reward": 2.0250000953674316, "rewards/format_reward": 1.0, "step": 192, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 275.40625, "epoch": 0.29555895865237364, "grad_norm": 7.072965681156847, "kl": 0.03271484375, "learning_rate": 7.995075188734946e-07, "loss": 0.0, "reward": 3.6312499046325684, "reward_std": 0.2462092638015747, "rewards/accuracy_reward": 2.331249952316284, "rewards/format_reward": 1.0, "step": 193, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 361.96875, "epoch": 0.29709035222052066, "grad_norm": 5.763751054733127, "kl": 0.025634765625, "learning_rate": 7.975778794936978e-07, "loss": 0.0, "reward": 3.4562501907348633, "reward_std": 0.33579856157302856, "rewards/accuracy_reward": 2.231250047683716, "rewards/format_reward": 1.0, "step": 194, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 282.84375, "epoch": 0.2986217457886677, "grad_norm": 5.903003459098319, "kl": 0.0294189453125, "learning_rate": 7.956413524302823e-07, "loss": 0.0, "reward": 3.40625, "reward_std": 0.4575210213661194, "rewards/accuracy_reward": 2.1812500953674316, "rewards/format_reward": 1.0, "step": 195, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 244.40625, "epoch": 0.3001531393568147, "grad_norm": 23.64457466042576, "kl": 0.042236328125, "learning_rate": 7.93697982505752e-07, "loss": 0.0, "reward": 2.8500001430511475, "reward_std": 0.33267366886138916, "rewards/accuracy_reward": 1.5499999523162842, "rewards/format_reward": 1.0, "step": 196, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 268.59375, "epoch": 0.3016845329249617, "grad_norm": 9.620171967989569, "kl": 0.03515625, "learning_rate": 7.917478147009949e-07, "loss": 0.0, "reward": 3.518749713897705, "reward_std": 0.4448161721229553, "rewards/accuracy_reward": 2.3687500953674316, "rewards/format_reward": 1.0, "step": 197, "temporal_rewards": 0.5 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 331.03125, "epoch": 0.30321592649310875, "grad_norm": 6.234928506561371, "kl": 0.036376953125, "learning_rate": 7.89790894154241e-07, "loss": 0.0, "reward": 3.2437500953674316, "reward_std": 0.4087636470794678, "rewards/accuracy_reward": 2.018749952316284, "rewards/format_reward": 1.0, "step": 198, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 296.15625, "epoch": 0.30474732006125577, "grad_norm": 9.715940321636149, "kl": 0.0361328125, "learning_rate": 7.878272661600185e-07, "loss": 0.0, "reward": 3.5875000953674316, "reward_std": 0.2343311309814453, "rewards/accuracy_reward": 2.2874999046325684, "rewards/format_reward": 1.0, "step": 199, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 322.125, "epoch": 0.30627871362940273, "grad_norm": 5.672785055414545, "kl": 0.031494140625, "learning_rate": 7.858569761681047e-07, "loss": 0.0, "reward": 2.875, "reward_std": 0.3694216012954712, "rewards/accuracy_reward": 1.5749999284744263, "rewards/format_reward": 1.0, "step": 200, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 328.4375, "epoch": 0.30781010719754975, "grad_norm": 6.123192750528283, "kl": 0.03271484375, "learning_rate": 7.838800697824743e-07, "loss": 0.0, "reward": 3.0625, "reward_std": 0.516420304775238, "rewards/accuracy_reward": 1.7625000476837158, "rewards/format_reward": 1.0, "step": 201, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 291.34375, "epoch": 0.3093415007656968, "grad_norm": 9.512927288541976, "kl": 0.03173828125, "learning_rate": 7.818965927602436e-07, "loss": 0.0, "reward": 4.099999904632568, "reward_std": 0.5014769434928894, "rewards/accuracy_reward": 2.799999952316284, "rewards/format_reward": 1.0, "step": 202, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 315.75, "epoch": 0.3108728943338438, "grad_norm": 5.366517575068269, "kl": 0.031005859375, "learning_rate": 7.799065910106126e-07, "loss": 0.0, "reward": 3.3937501907348633, "reward_std": 0.5237241983413696, "rewards/accuracy_reward": 2.093750238418579, "rewards/format_reward": 1.0, "step": 203, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 271.90625, "epoch": 0.3124042879019908, "grad_norm": 12.699080352281662, "kl": 0.04345703125, "learning_rate": 7.779101105938004e-07, "loss": 0.0, "reward": 3.8375000953674316, "reward_std": 0.4205377995967865, "rewards/accuracy_reward": 2.5375001430511475, "rewards/format_reward": 1.0, "step": 204, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 331.4375, "epoch": 0.31393568147013784, "grad_norm": 7.7656380006752155, "kl": 0.035888671875, "learning_rate": 7.759071977199806e-07, "loss": 0.0, "reward": 3.4749999046325684, "reward_std": 0.35376298427581787, "rewards/accuracy_reward": 2.174999952316284, "rewards/format_reward": 1.0, "step": 205, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 312.46875, "epoch": 0.31546707503828486, "grad_norm": 4.391130785851663, "kl": 0.0341796875, "learning_rate": 7.738978987482112e-07, "loss": 0.0, "reward": 3.5, "reward_std": 0.4183518886566162, "rewards/accuracy_reward": 2.1999998092651367, "rewards/format_reward": 1.0, "step": 206, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 245.40625, "epoch": 0.3169984686064318, "grad_norm": 5.540719912300597, "kl": 0.03955078125, "learning_rate": 7.71882260185362e-07, "loss": 0.0, "reward": 3.40625, "reward_std": 0.31065988540649414, "rewards/accuracy_reward": 2.1062498092651367, "rewards/format_reward": 1.0, "step": 207, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 313.1875, "epoch": 0.31852986217457885, "grad_norm": 17.420886036265625, "kl": 0.03515625, "learning_rate": 7.698603286850374e-07, "loss": 0.0, "reward": 3.8500001430511475, "reward_std": 0.30487963557243347, "rewards/accuracy_reward": 2.549999952316284, "rewards/format_reward": 1.0, "step": 208, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 256.6875, "epoch": 0.32006125574272587, "grad_norm": 7.933847200772637, "kl": 0.0458984375, "learning_rate": 7.678321510464971e-07, "loss": 0.0, "reward": 3.856250047683716, "reward_std": 0.29905903339385986, "rewards/accuracy_reward": 2.5562500953674316, "rewards/format_reward": 1.0, "step": 209, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 336.65625, "epoch": 0.3215926493108729, "grad_norm": 9.297095513483827, "kl": 0.034912109375, "learning_rate": 7.657977742135725e-07, "loss": 0.0, "reward": 2.9124999046325684, "reward_std": 0.4157797396183014, "rewards/accuracy_reward": 1.6124999523162842, "rewards/format_reward": 1.0, "step": 210, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 302.03125, "epoch": 0.3231240428790199, "grad_norm": 5.541167825333942, "kl": 0.036376953125, "learning_rate": 7.637572452735813e-07, "loss": 0.0, "reward": 3.7750000953674316, "reward_std": 0.49859824776649475, "rewards/accuracy_reward": 2.4749999046325684, "rewards/format_reward": 1.0, "step": 211, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 255.40625, "epoch": 0.32465543644716693, "grad_norm": 9.168998819803033, "kl": 0.04052734375, "learning_rate": 7.617106114562359e-07, "loss": 0.0, "reward": 4.243749618530273, "reward_std": 0.4166516363620758, "rewards/accuracy_reward": 2.9437501430511475, "rewards/format_reward": 1.0, "step": 212, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 264.71875, "epoch": 0.32618683001531396, "grad_norm": 9.531857908625076, "kl": 0.042236328125, "learning_rate": 7.596579201325515e-07, "loss": 0.0, "reward": 3.4749999046325684, "reward_std": 0.377795547246933, "rewards/accuracy_reward": 2.174999952316284, "rewards/format_reward": 1.0, "step": 213, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 326.0625, "epoch": 0.327718223583461, "grad_norm": 8.31464966245884, "kl": 0.041015625, "learning_rate": 7.57599218813749e-07, "loss": 0.0, "reward": 3.6500000953674316, "reward_std": 0.378510445356369, "rewards/accuracy_reward": 2.3500001430511475, "rewards/format_reward": 1.0, "step": 214, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 286.34375, "epoch": 0.32924961715160794, "grad_norm": 4.287495478359939, "kl": 0.0380859375, "learning_rate": 7.555345551501557e-07, "loss": 0.0, "reward": 3.1812498569488525, "reward_std": 0.38055357336997986, "rewards/accuracy_reward": 1.881250023841858, "rewards/format_reward": 1.0, "step": 215, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 324.65625, "epoch": 0.33078101071975496, "grad_norm": 15.056876642486671, "kl": 0.03662109375, "learning_rate": 7.534639769301024e-07, "loss": 0.0, "reward": 4.268750190734863, "reward_std": 0.2899988889694214, "rewards/accuracy_reward": 2.96875, "rewards/format_reward": 1.0, "step": 216, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 244.90625, "epoch": 0.332312404287902, "grad_norm": 9.069801680915958, "kl": 0.04345703125, "learning_rate": 7.513875320788165e-07, "loss": 0.0, "reward": 4.099999904632568, "reward_std": 0.3825719654560089, "rewards/accuracy_reward": 2.799999713897705, "rewards/format_reward": 1.0, "step": 217, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 305.1875, "epoch": 0.333843797856049, "grad_norm": 10.955229211982534, "kl": 0.04052734375, "learning_rate": 7.493052686573147e-07, "loss": 0.0, "reward": 3.731250047683716, "reward_std": 0.6039197444915771, "rewards/accuracy_reward": 2.4312500953674316, "rewards/format_reward": 1.0, "step": 218, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 273.15625, "epoch": 0.33537519142419603, "grad_norm": 5.926525708351621, "kl": 0.039794921875, "learning_rate": 7.472172348612876e-07, "loss": 0.0, "reward": 4.28125, "reward_std": 0.6196683049201965, "rewards/accuracy_reward": 2.9812498092651367, "rewards/format_reward": 1.0, "step": 219, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 353.5625, "epoch": 0.33690658499234305, "grad_norm": 7.464320668891594, "kl": 0.035400390625, "learning_rate": 7.451234790199871e-07, "loss": 0.0, "reward": 3.049999952316284, "reward_std": 0.4355461001396179, "rewards/accuracy_reward": 1.75, "rewards/format_reward": 1.0, "step": 220, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 361.40625, "epoch": 0.33843797856049007, "grad_norm": 12.12432567991499, "kl": 0.036865234375, "learning_rate": 7.430240495951062e-07, "loss": 0.0, "reward": 2.9812498092651367, "reward_std": 0.4996665120124817, "rewards/accuracy_reward": 1.7562501430511475, "rewards/format_reward": 1.0, "step": 221, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 274.78125, "epoch": 0.33996937212863704, "grad_norm": 5.359147879862915, "kl": 0.042236328125, "learning_rate": 7.409189951796574e-07, "loss": 0.0, "reward": 3.4187498092651367, "reward_std": 0.6283072233200073, "rewards/accuracy_reward": 2.1937501430511475, "rewards/format_reward": 1.0, "step": 222, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 283.03125, "epoch": 0.34150076569678406, "grad_norm": 12.760200628929272, "kl": 0.05078125, "learning_rate": 7.388083644968481e-07, "loss": 0.0001, "reward": 3.90625, "reward_std": 0.5293543338775635, "rewards/accuracy_reward": 2.6062498092651367, "rewards/format_reward": 1.0, "step": 223, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 351.90625, "epoch": 0.3430321592649311, "grad_norm": 5.413212423004192, "kl": 0.037109375, "learning_rate": 7.366922063989535e-07, "loss": 0.0, "reward": 3.793750047683716, "reward_std": 0.37553951144218445, "rewards/accuracy_reward": 2.4937500953674316, "rewards/format_reward": 1.0, "step": 224, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 282.875, "epoch": 0.3445635528330781, "grad_norm": 10.152321573196087, "kl": 0.044921875, "learning_rate": 7.345705698661852e-07, "loss": 0.0, "reward": 3.393749713897705, "reward_std": 0.518860936164856, "rewards/accuracy_reward": 2.168750047683716, "rewards/format_reward": 1.0, "step": 225, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 280.65625, "epoch": 0.3460949464012251, "grad_norm": 6.901158008710162, "kl": 0.04345703125, "learning_rate": 7.324435040055571e-07, "loss": 0.0, "reward": 3.3812499046325684, "reward_std": 0.4390850067138672, "rewards/accuracy_reward": 2.081249952316284, "rewards/format_reward": 1.0, "step": 226, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 348.625, "epoch": 0.34762633996937214, "grad_norm": 4.907181453263694, "kl": 0.033935546875, "learning_rate": 7.303110580497501e-07, "loss": 0.0, "reward": 3.012500047683716, "reward_std": 0.3685019910335541, "rewards/accuracy_reward": 1.787500023841858, "rewards/format_reward": 1.0, "step": 227, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 288.0, "epoch": 0.34915773353751917, "grad_norm": 8.5707111135805, "kl": 0.0556640625, "learning_rate": 7.281732813559713e-07, "loss": 0.0001, "reward": 3.3187499046325684, "reward_std": 0.5146178007125854, "rewards/accuracy_reward": 2.018749952316284, "rewards/format_reward": 1.0, "step": 228, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 287.5, "epoch": 0.35068912710566613, "grad_norm": 11.27891495949733, "kl": 0.042236328125, "learning_rate": 7.260302234048125e-07, "loss": 0.0, "reward": 3.575000047683716, "reward_std": 0.3730372190475464, "rewards/accuracy_reward": 2.2750000953674316, "rewards/format_reward": 1.0, "step": 229, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 293.9375, "epoch": 0.35222052067381315, "grad_norm": 19.476901011082536, "kl": 0.04150390625, "learning_rate": 7.23881933799104e-07, "loss": 0.0, "reward": 3.924999952316284, "reward_std": 0.5146142244338989, "rewards/accuracy_reward": 2.625, "rewards/format_reward": 1.0, "step": 230, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 307.25, "epoch": 0.3537519142419602, "grad_norm": 6.55022815298153, "kl": 0.044921875, "learning_rate": 7.217284622627674e-07, "loss": 0.0, "reward": 3.968749761581421, "reward_std": 0.6387063264846802, "rewards/accuracy_reward": 2.668750286102295, "rewards/format_reward": 1.0, "step": 231, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 330.0625, "epoch": 0.3552833078101072, "grad_norm": 11.903816051107185, "kl": 0.041259765625, "learning_rate": 7.195698586396645e-07, "loss": 0.0, "reward": 2.6374998092651367, "reward_std": 0.28559741377830505, "rewards/accuracy_reward": 1.412500023841858, "rewards/format_reward": 1.0, "step": 232, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 318.21875, "epoch": 0.3568147013782542, "grad_norm": 7.891011771111216, "kl": 0.04052734375, "learning_rate": 7.174061728924428e-07, "loss": 0.0, "reward": 3.1499998569488525, "reward_std": 0.5327916145324707, "rewards/accuracy_reward": 1.8499999046325684, "rewards/format_reward": 1.0, "step": 233, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 304.3125, "epoch": 0.35834609494640124, "grad_norm": 14.640401008280097, "kl": 0.03955078125, "learning_rate": 7.152374551013804e-07, "loss": 0.0, "reward": 3.3499999046325684, "reward_std": 0.5537967681884766, "rewards/accuracy_reward": 2.049999952316284, "rewards/format_reward": 1.0, "step": 234, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 305.40625, "epoch": 0.35987748851454826, "grad_norm": 4.292297714421554, "kl": 0.04443359375, "learning_rate": 7.130637554632257e-07, "loss": 0.0, "reward": 2.9937498569488525, "reward_std": 0.1590990275144577, "rewards/accuracy_reward": 1.6937499046325684, "rewards/format_reward": 1.0, "step": 235, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 333.28125, "epoch": 0.3614088820826952, "grad_norm": 36.0396195163604, "kl": 0.038330078125, "learning_rate": 7.108851242900364e-07, "loss": 0.0, "reward": 2.9250001907348633, "reward_std": 0.4584749937057495, "rewards/accuracy_reward": 1.625, "rewards/format_reward": 1.0, "step": 236, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 280.28125, "epoch": 0.36294027565084225, "grad_norm": 10.543504458410956, "kl": 0.048095703125, "learning_rate": 7.087016120080145e-07, "loss": 0.0, "reward": 3.687499761581421, "reward_std": 0.4067375659942627, "rewards/accuracy_reward": 2.387500047683716, "rewards/format_reward": 1.0, "step": 237, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 279.4375, "epoch": 0.36447166921898927, "grad_norm": 12.020212644340816, "kl": 0.05517578125, "learning_rate": 7.065132691563388e-07, "loss": 0.0001, "reward": 4.162500381469727, "reward_std": 0.6134676933288574, "rewards/accuracy_reward": 2.937500238418579, "rewards/format_reward": 1.0, "step": 238, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 281.03125, "epoch": 0.3660030627871363, "grad_norm": 6.445427231037523, "kl": 0.04638671875, "learning_rate": 7.043201463859963e-07, "loss": 0.0, "reward": 4.006249904632568, "reward_std": 0.39609289169311523, "rewards/accuracy_reward": 2.706249952316284, "rewards/format_reward": 1.0, "step": 239, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 282.4375, "epoch": 0.3675344563552833, "grad_norm": 12.36139816672738, "kl": 0.04443359375, "learning_rate": 7.021222944586088e-07, "loss": 0.0, "reward": 3.0812501907348633, "reward_std": 0.21749000251293182, "rewards/accuracy_reward": 1.78125, "rewards/format_reward": 1.0, "step": 240, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 305.96875, "epoch": 0.36906584992343033, "grad_norm": 46.70492156510754, "kl": 0.04248046875, "learning_rate": 6.999197642452583e-07, "loss": 0.0, "reward": 3.53125, "reward_std": 0.339226096868515, "rewards/accuracy_reward": 2.231250047683716, "rewards/format_reward": 1.0, "step": 241, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 300.75, "epoch": 0.37059724349157736, "grad_norm": 5.008941210138734, "kl": 0.044189453125, "learning_rate": 6.977126067253095e-07, "loss": 0.0, "reward": 3.143749952316284, "reward_std": 0.5060650110244751, "rewards/accuracy_reward": 1.8437498807907104, "rewards/format_reward": 1.0, "step": 242, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 255.09375, "epoch": 0.3721286370597244, "grad_norm": 7.851328072810821, "kl": 0.04931640625, "learning_rate": 6.9550087298523e-07, "loss": 0.0, "reward": 3.9749999046325684, "reward_std": 0.39178720116615295, "rewards/accuracy_reward": 2.6750001907348633, "rewards/format_reward": 1.0, "step": 243, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 332.6875, "epoch": 0.37366003062787134, "grad_norm": 15.778103385034653, "kl": 0.04052734375, "learning_rate": 6.93284614217408e-07, "loss": 0.0, "reward": 2.856250047683716, "reward_std": 0.3301621079444885, "rewards/accuracy_reward": 1.5562498569488525, "rewards/format_reward": 1.0, "step": 244, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 272.21875, "epoch": 0.37519142419601836, "grad_norm": 11.303882433021103, "kl": 0.0458984375, "learning_rate": 6.910638817189664e-07, "loss": 0.0, "reward": 3.8062500953674316, "reward_std": 0.46951526403427124, "rewards/accuracy_reward": 2.5062499046325684, "rewards/format_reward": 1.0, "step": 245, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 334.25, "epoch": 0.3767228177641654, "grad_norm": 10.26008198653885, "kl": 0.04296875, "learning_rate": 6.888387268905773e-07, "loss": 0.0, "reward": 3.0749998092651367, "reward_std": 0.4924160838127136, "rewards/accuracy_reward": 1.7750000953674316, "rewards/format_reward": 1.0, "step": 246, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 351.40625, "epoch": 0.3782542113323124, "grad_norm": 31.96761888686875, "kl": 0.0380859375, "learning_rate": 6.866092012352705e-07, "loss": 0.0, "reward": 3.331249952316284, "reward_std": 0.6083425283432007, "rewards/accuracy_reward": 2.03125, "rewards/format_reward": 1.0, "step": 247, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 298.28125, "epoch": 0.37978560490045943, "grad_norm": 6.5775361765625595, "kl": 0.04345703125, "learning_rate": 6.843753563572423e-07, "loss": 0.0, "reward": 3.9000000953674316, "reward_std": 0.659449577331543, "rewards/accuracy_reward": 2.674999952316284, "rewards/format_reward": 1.0, "step": 248, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 361.53125, "epoch": 0.38131699846860645, "grad_norm": 6.36601489337084, "kl": 0.0400390625, "learning_rate": 6.821372439606611e-07, "loss": 0.0, "reward": 2.9499998092651367, "reward_std": 0.4043930172920227, "rewards/accuracy_reward": 1.6500000953674316, "rewards/format_reward": 1.0, "step": 249, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 318.0, "epoch": 0.38284839203675347, "grad_norm": 3.1943067429090424, "kl": 0.04541015625, "learning_rate": 6.798949158484705e-07, "loss": 0.0, "reward": 3.0249998569488525, "reward_std": 0.38181358575820923, "rewards/accuracy_reward": 1.7999999523162842, "rewards/format_reward": 1.0, "step": 250, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 302.09375, "epoch": 0.38437978560490044, "grad_norm": 5.713129404217661, "kl": 0.046630859375, "learning_rate": 6.776484239211903e-07, "loss": 0.0, "reward": 3.268749952316284, "reward_std": 0.26601967215538025, "rewards/accuracy_reward": 1.96875, "rewards/format_reward": 1.0, "step": 251, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 321.125, "epoch": 0.38591117917304746, "grad_norm": 6.335821394393776, "kl": 0.052490234375, "learning_rate": 6.753978201757149e-07, "loss": 0.0001, "reward": 3.456249952316284, "reward_std": 0.3842371702194214, "rewards/accuracy_reward": 2.15625, "rewards/format_reward": 1.0, "step": 252, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 276.71875, "epoch": 0.3874425727411945, "grad_norm": 8.679817303939133, "kl": 0.046875, "learning_rate": 6.731431567041106e-07, "loss": 0.0, "reward": 3.7562499046325684, "reward_std": 0.5177336931228638, "rewards/accuracy_reward": 2.456249952316284, "rewards/format_reward": 1.0, "step": 253, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 263.9375, "epoch": 0.3889739663093415, "grad_norm": 5.808317301193834, "kl": 0.04833984375, "learning_rate": 6.708844856924088e-07, "loss": 0.0, "reward": 4.125, "reward_std": 0.4449988007545471, "rewards/accuracy_reward": 2.9000000953674316, "rewards/format_reward": 1.0, "step": 254, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 341.1875, "epoch": 0.3905053598774885, "grad_norm": 8.34282922172143, "kl": 0.041015625, "learning_rate": 6.686218594193993e-07, "loss": 0.0, "reward": 3.1062498092651367, "reward_std": 0.3678450584411621, "rewards/accuracy_reward": 1.8062500953674316, "rewards/format_reward": 1.0, "step": 255, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 285.0, "epoch": 0.39203675344563554, "grad_norm": 6.993127416049526, "kl": 0.053955078125, "learning_rate": 6.663553302554193e-07, "loss": 0.0001, "reward": 3.6812500953674316, "reward_std": 0.26855015754699707, "rewards/accuracy_reward": 2.4562501907348633, "rewards/format_reward": 1.0, "step": 256, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 320.6875, "epoch": 0.39356814701378257, "grad_norm": 21.057020289534563, "kl": 0.04638671875, "learning_rate": 6.640849506611417e-07, "loss": 0.0, "reward": 3.9124999046325684, "reward_std": 0.4169827699661255, "rewards/accuracy_reward": 2.6125001907348633, "rewards/format_reward": 1.0, "step": 257, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 304.1875, "epoch": 0.39509954058192953, "grad_norm": 7.14663466029652, "kl": 0.044677734375, "learning_rate": 6.618107731863608e-07, "loss": 0.0, "reward": 4.231250286102295, "reward_std": 0.5161499977111816, "rewards/accuracy_reward": 2.9312500953674316, "rewards/format_reward": 1.0, "step": 258, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 341.25, "epoch": 0.39663093415007655, "grad_norm": 9.2743529009992, "kl": 0.04150390625, "learning_rate": 6.595328504687757e-07, "loss": 0.0, "reward": 3.231250047683716, "reward_std": 0.40922269225120544, "rewards/accuracy_reward": 2.0062499046325684, "rewards/format_reward": 1.0, "step": 259, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 326.375, "epoch": 0.3981623277182236, "grad_norm": 4.79212098522223, "kl": 0.03955078125, "learning_rate": 6.572512352327726e-07, "loss": 0.0, "reward": 3.6687498092651367, "reward_std": 0.3076220154762268, "rewards/accuracy_reward": 2.3687500953674316, "rewards/format_reward": 1.0, "step": 260, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 302.53125, "epoch": 0.3996937212863706, "grad_norm": 16.65575325809638, "kl": 0.0478515625, "learning_rate": 6.549659802882038e-07, "loss": 0.0, "reward": 3.3812501430511475, "reward_std": 0.3832942843437195, "rewards/accuracy_reward": 2.081249952316284, "rewards/format_reward": 1.0, "step": 261, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 297.375, "epoch": 0.4012251148545176, "grad_norm": 6.152326499680604, "kl": 0.04443359375, "learning_rate": 6.526771385291656e-07, "loss": 0.0, "reward": 3.84375, "reward_std": 0.38622021675109863, "rewards/accuracy_reward": 2.5437498092651367, "rewards/format_reward": 1.0, "step": 262, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 282.40625, "epoch": 0.40275650842266464, "grad_norm": 10.417553039855553, "kl": 0.050048828125, "learning_rate": 6.503847629327744e-07, "loss": 0.0, "reward": 4.006250381469727, "reward_std": 0.583352267742157, "rewards/accuracy_reward": 2.7062501907348633, "rewards/format_reward": 1.0, "step": 263, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 331.0, "epoch": 0.40428790199081166, "grad_norm": 3.669963233981849, "kl": 0.045166015625, "learning_rate": 6.480889065579398e-07, "loss": 0.0, "reward": 2.7249999046325684, "reward_std": 0.2972213923931122, "rewards/accuracy_reward": 1.4250000715255737, "rewards/format_reward": 1.0, "step": 264, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 259.90625, "epoch": 0.4058192955589586, "grad_norm": 4.88282000130352, "kl": 0.0556640625, "learning_rate": 6.457896225441371e-07, "loss": 0.0001, "reward": 3.1875, "reward_std": 0.2949331998825073, "rewards/accuracy_reward": 1.962499976158142, "rewards/format_reward": 1.0, "step": 265, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 265.125, "epoch": 0.40735068912710565, "grad_norm": 22.004499913124683, "kl": 0.046875, "learning_rate": 6.434869641101768e-07, "loss": 0.0, "reward": 3.950000047683716, "reward_std": 0.6295482516288757, "rewards/accuracy_reward": 2.6500000953674316, "rewards/format_reward": 1.0, "step": 266, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 293.8125, "epoch": 0.40888208269525267, "grad_norm": 6.6840837429320485, "kl": 0.050048828125, "learning_rate": 6.411809845529734e-07, "loss": 0.0, "reward": 3.893749713897705, "reward_std": 0.5396034717559814, "rewards/accuracy_reward": 2.668750047683716, "rewards/format_reward": 1.0, "step": 267, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 286.5625, "epoch": 0.4104134762633997, "grad_norm": 7.143023845506581, "kl": 0.05029296875, "learning_rate": 6.388717372463115e-07, "loss": 0.0001, "reward": 3.731250047683716, "reward_std": 0.3467658758163452, "rewards/accuracy_reward": 2.4312498569488525, "rewards/format_reward": 1.0, "step": 268, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 298.4375, "epoch": 0.4119448698315467, "grad_norm": 12.633736339498368, "kl": 0.05224609375, "learning_rate": 6.365592756396101e-07, "loss": 0.0001, "reward": 2.9250001907348633, "reward_std": 0.4054605960845947, "rewards/accuracy_reward": 1.6250001192092896, "rewards/format_reward": 1.0, "step": 269, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 297.40625, "epoch": 0.41347626339969373, "grad_norm": 4.355690686839267, "kl": 0.05078125, "learning_rate": 6.342436532566865e-07, "loss": 0.0001, "reward": 3.5749998092651367, "reward_std": 0.2884190082550049, "rewards/accuracy_reward": 2.2750000953674316, "rewards/format_reward": 1.0, "step": 270, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 298.34375, "epoch": 0.41500765696784075, "grad_norm": 5.447937890611708, "kl": 0.04736328125, "learning_rate": 6.319249236945161e-07, "loss": 0.0, "reward": 3.674999713897705, "reward_std": 0.5889295339584351, "rewards/accuracy_reward": 2.450000047683716, "rewards/format_reward": 1.0, "step": 271, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 255.6875, "epoch": 0.4165390505359878, "grad_norm": 15.587262473448611, "kl": 0.05419921875, "learning_rate": 6.296031406219926e-07, "loss": 0.0001, "reward": 3.5562500953674316, "reward_std": 0.4689219295978546, "rewards/accuracy_reward": 2.2562499046325684, "rewards/format_reward": 1.0, "step": 272, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 314.78125, "epoch": 0.41807044410413474, "grad_norm": 5.16002952322231, "kl": 0.052978515625, "learning_rate": 6.272783577786862e-07, "loss": 0.0001, "reward": 3.4562501907348633, "reward_std": 0.5224058628082275, "rewards/accuracy_reward": 2.15625, "rewards/format_reward": 1.0, "step": 273, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 311.28125, "epoch": 0.41960183767228176, "grad_norm": 15.23123318426172, "kl": 0.047119140625, "learning_rate": 6.249506289735984e-07, "loss": 0.0, "reward": 3.299999952316284, "reward_std": 0.44371646642684937, "rewards/accuracy_reward": 2.0, "rewards/format_reward": 1.0, "step": 274, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 326.875, "epoch": 0.4211332312404288, "grad_norm": 6.07627204223583, "kl": 0.043212890625, "learning_rate": 6.226200080839182e-07, "loss": 0.0, "reward": 4.193749904632568, "reward_std": 0.41577982902526855, "rewards/accuracy_reward": 2.8937501907348633, "rewards/format_reward": 1.0, "step": 275, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 304.46875, "epoch": 0.4226646248085758, "grad_norm": 3.736918599404993, "kl": 0.05078125, "learning_rate": 6.202865490537739e-07, "loss": 0.0001, "reward": 3.531249761581421, "reward_std": 0.3116908073425293, "rewards/accuracy_reward": 2.231250047683716, "rewards/format_reward": 1.0, "step": 276, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 284.71875, "epoch": 0.4241960183767228, "grad_norm": 37.59291744193968, "kl": 0.05029296875, "learning_rate": 6.179503058929849e-07, "loss": 0.0, "reward": 3.6687498092651367, "reward_std": 0.4390817880630493, "rewards/accuracy_reward": 2.3687500953674316, "rewards/format_reward": 1.0, "step": 277, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 336.9375, "epoch": 0.42572741194486985, "grad_norm": 6.1493970390526105, "kl": 0.044677734375, "learning_rate": 6.156113326758118e-07, "loss": 0.0, "reward": 3.5687499046325684, "reward_std": 0.3175239861011505, "rewards/accuracy_reward": 2.268749952316284, "rewards/format_reward": 1.0, "step": 278, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 270.46875, "epoch": 0.42725880551301687, "grad_norm": 4.579166257528243, "kl": 0.043701171875, "learning_rate": 6.132696835397038e-07, "loss": 0.0, "reward": 3.5999999046325684, "reward_std": 0.3979983329772949, "rewards/accuracy_reward": 2.299999952316284, "rewards/format_reward": 1.0, "step": 279, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 290.9375, "epoch": 0.42879019908116384, "grad_norm": 11.69894583370795, "kl": 0.052978515625, "learning_rate": 6.109254126840479e-07, "loss": 0.0001, "reward": 2.9124999046325684, "reward_std": 0.40907424688339233, "rewards/accuracy_reward": 1.6875, "rewards/format_reward": 1.0, "step": 280, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 291.75, "epoch": 0.43032159264931086, "grad_norm": 4.623021086674321, "kl": 0.04345703125, "learning_rate": 6.085785743689113e-07, "loss": 0.0, "reward": 3.46875, "reward_std": 0.39231395721435547, "rewards/accuracy_reward": 2.2437498569488525, "rewards/format_reward": 1.0, "step": 281, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 313.28125, "epoch": 0.4318529862174579, "grad_norm": 6.303506543948684, "kl": 0.048095703125, "learning_rate": 6.062292229137885e-07, "loss": 0.0, "reward": 3.3687500953674316, "reward_std": 0.46298372745513916, "rewards/accuracy_reward": 2.0687499046325684, "rewards/format_reward": 1.0, "step": 282, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 264.5, "epoch": 0.4333843797856049, "grad_norm": 13.679405544800808, "kl": 0.04833984375, "learning_rate": 6.038774126963416e-07, "loss": 0.0, "reward": 4.356250286102295, "reward_std": 0.37773382663726807, "rewards/accuracy_reward": 3.0562498569488525, "rewards/format_reward": 1.0, "step": 283, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 262.21875, "epoch": 0.4349157733537519, "grad_norm": 14.30067593344122, "kl": 0.055908203125, "learning_rate": 6.015231981511439e-07, "loss": 0.0001, "reward": 3.5749998092651367, "reward_std": 0.29896894097328186, "rewards/accuracy_reward": 2.2749998569488525, "rewards/format_reward": 1.0, "step": 284, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 265.78125, "epoch": 0.43644716692189894, "grad_norm": 7.190634169332564, "kl": 0.05908203125, "learning_rate": 5.991666337684176e-07, "loss": 0.0001, "reward": 4.068750381469727, "reward_std": 0.6775935292243958, "rewards/accuracy_reward": 2.84375, "rewards/format_reward": 1.0, "step": 285, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 263.59375, "epoch": 0.43797856049004597, "grad_norm": 6.305888489386578, "kl": 0.051513671875, "learning_rate": 5.968077740927748e-07, "loss": 0.0001, "reward": 3.53125, "reward_std": 0.5097336769104004, "rewards/accuracy_reward": 2.231250047683716, "rewards/format_reward": 1.0, "step": 286, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 349.59375, "epoch": 0.43950995405819293, "grad_norm": 8.02368291703908, "kl": 0.03857421875, "learning_rate": 5.944466737219536e-07, "loss": 0.0, "reward": 3.393749952316284, "reward_std": 0.4041438102722168, "rewards/accuracy_reward": 2.09375, "rewards/format_reward": 1.0, "step": 287, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 322.9375, "epoch": 0.44104134762633995, "grad_norm": 3.7358796421010387, "kl": 0.04443359375, "learning_rate": 5.920833873055546e-07, "loss": 0.0, "reward": 3.65625, "reward_std": 0.33253127336502075, "rewards/accuracy_reward": 2.4312500953674316, "rewards/format_reward": 1.0, "step": 288, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 289.84375, "epoch": 0.442572741194487, "grad_norm": 5.746987770848019, "kl": 0.052001953125, "learning_rate": 5.89717969543777e-07, "loss": 0.0001, "reward": 3.46875, "reward_std": 0.4438609480857849, "rewards/accuracy_reward": 2.168750047683716, "rewards/format_reward": 1.0, "step": 289, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 282.3125, "epoch": 0.444104134762634, "grad_norm": 14.501094390969634, "kl": 0.05712890625, "learning_rate": 5.873504751861507e-07, "loss": 0.0001, "reward": 3.731250047683716, "reward_std": 0.41762256622314453, "rewards/accuracy_reward": 2.4312500953674316, "rewards/format_reward": 1.0, "step": 290, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 266.1875, "epoch": 0.445635528330781, "grad_norm": 5.515171753260493, "kl": 0.055419921875, "learning_rate": 5.849809590302712e-07, "loss": 0.0001, "reward": 3.71875, "reward_std": 0.46692323684692383, "rewards/accuracy_reward": 2.418750047683716, "rewards/format_reward": 1.0, "step": 291, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 242.8125, "epoch": 0.44716692189892804, "grad_norm": 17.253150012005655, "kl": 0.0517578125, "learning_rate": 5.826094759205293e-07, "loss": 0.0001, "reward": 4.137499809265137, "reward_std": 0.41192424297332764, "rewards/accuracy_reward": 2.8375000953674316, "rewards/format_reward": 1.0, "step": 292, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 307.90625, "epoch": 0.44869831546707506, "grad_norm": 6.706786195251721, "kl": 0.04931640625, "learning_rate": 5.802360807468427e-07, "loss": 0.0, "reward": 3.3249998092651367, "reward_std": 0.3380519151687622, "rewards/accuracy_reward": 2.0999999046325684, "rewards/format_reward": 1.0, "step": 293, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 310.625, "epoch": 0.450229709035222, "grad_norm": 8.494514075528341, "kl": 0.047119140625, "learning_rate": 5.778608284433862e-07, "loss": 0.0, "reward": 3.325000047683716, "reward_std": 0.48659753799438477, "rewards/accuracy_reward": 2.0999999046325684, "rewards/format_reward": 1.0, "step": 294, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 350.125, "epoch": 0.45176110260336905, "grad_norm": 5.536688254453823, "kl": 0.04296875, "learning_rate": 5.754837739873178e-07, "loss": 0.0, "reward": 2.856250047683716, "reward_std": 0.25302496552467346, "rewards/accuracy_reward": 1.7062499523162842, "rewards/format_reward": 1.0, "step": 295, "temporal_rewards": 0.5 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 337.0, "epoch": 0.45329249617151607, "grad_norm": 4.692889551300066, "kl": 0.0458984375, "learning_rate": 5.731049723975096e-07, "loss": 0.0, "reward": 3.25, "reward_std": 0.5270546674728394, "rewards/accuracy_reward": 2.0250000953674316, "rewards/format_reward": 1.0, "step": 296, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 288.34375, "epoch": 0.4548238897396631, "grad_norm": 6.310747002522716, "kl": 0.0458984375, "learning_rate": 5.707244787332711e-07, "loss": 0.0, "reward": 3.643749952316284, "reward_std": 0.46277567744255066, "rewards/accuracy_reward": 2.343749761581421, "rewards/format_reward": 1.0, "step": 297, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 332.96875, "epoch": 0.4563552833078101, "grad_norm": 7.199689739207391, "kl": 0.04541015625, "learning_rate": 5.683423480930774e-07, "loss": 0.0, "reward": 3.6312499046325684, "reward_std": 0.42347651720046997, "rewards/accuracy_reward": 2.331249952316284, "rewards/format_reward": 1.0, "step": 298, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 296.21875, "epoch": 0.45788667687595713, "grad_norm": 11.877389622998688, "kl": 0.0458984375, "learning_rate": 5.659586356132917e-07, "loss": 0.0, "reward": 3.768749952316284, "reward_std": 0.5539591312408447, "rewards/accuracy_reward": 2.5437498092651367, "rewards/format_reward": 1.0, "step": 299, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 261.78125, "epoch": 0.45941807044410415, "grad_norm": 4.089729197678542, "kl": 0.04931640625, "learning_rate": 5.635733964668909e-07, "loss": 0.0, "reward": 3.71875, "reward_std": 0.40402692556381226, "rewards/accuracy_reward": 2.418750047683716, "rewards/format_reward": 1.0, "step": 300, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 269.84375, "epoch": 0.4609494640122512, "grad_norm": 8.83236964834344, "kl": 0.051513671875, "learning_rate": 5.611866858621874e-07, "loss": 0.0001, "reward": 3.9187498092651367, "reward_std": 0.31364038586616516, "rewards/accuracy_reward": 2.6187500953674316, "rewards/format_reward": 1.0, "step": 301, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 256.34375, "epoch": 0.46248085758039814, "grad_norm": 10.261742972214364, "kl": 0.05078125, "learning_rate": 5.587985590415523e-07, "loss": 0.0001, "reward": 4.306249618530273, "reward_std": 0.3834628462791443, "rewards/accuracy_reward": 3.0062499046325684, "rewards/format_reward": 1.0, "step": 302, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 296.9375, "epoch": 0.46401225114854516, "grad_norm": 10.77009600273013, "kl": 0.049072265625, "learning_rate": 5.564090712801355e-07, "loss": 0.0, "reward": 4.081250190734863, "reward_std": 0.5357294678688049, "rewards/accuracy_reward": 2.78125, "rewards/format_reward": 1.0, "step": 303, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 319.1875, "epoch": 0.4655436447166922, "grad_norm": 10.077594491655711, "kl": 0.047607421875, "learning_rate": 5.540182778845871e-07, "loss": 0.0, "reward": 3.25, "reward_std": 0.4022381901741028, "rewards/accuracy_reward": 2.0250000953674316, "rewards/format_reward": 1.0, "step": 304, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 273.46875, "epoch": 0.4670750382848392, "grad_norm": 23.880086686835266, "kl": 0.047119140625, "learning_rate": 5.516262341917778e-07, "loss": 0.0, "reward": 3.8312501907348633, "reward_std": 0.577486515045166, "rewards/accuracy_reward": 2.53125, "rewards/format_reward": 1.0, "step": 305, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 279.71875, "epoch": 0.4686064318529862, "grad_norm": 5.6859935156058725, "kl": 0.052978515625, "learning_rate": 5.492329955675166e-07, "loss": 0.0001, "reward": 3.481250047683716, "reward_std": 0.4928224980831146, "rewards/accuracy_reward": 2.2562499046325684, "rewards/format_reward": 1.0, "step": 306, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 308.375, "epoch": 0.47013782542113325, "grad_norm": 16.540677042156723, "kl": 0.0458984375, "learning_rate": 5.468386174052709e-07, "loss": 0.0, "reward": 3.4187498092651367, "reward_std": 0.4784466326236725, "rewards/accuracy_reward": 2.1187500953674316, "rewards/format_reward": 1.0, "step": 307, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 281.3125, "epoch": 0.47166921898928027, "grad_norm": 4.992857126648013, "kl": 0.050048828125, "learning_rate": 5.444431551248831e-07, "loss": 0.0, "reward": 3.450000047683716, "reward_std": 0.5468271970748901, "rewards/accuracy_reward": 2.1499998569488525, "rewards/format_reward": 1.0, "step": 308, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 350.28125, "epoch": 0.47320061255742724, "grad_norm": 7.387730644270765, "kl": 0.04345703125, "learning_rate": 5.420466641712886e-07, "loss": 0.0, "reward": 3.362499952316284, "reward_std": 0.44643062353134155, "rewards/accuracy_reward": 2.0625, "rewards/format_reward": 1.0, "step": 309, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 295.28125, "epoch": 0.47473200612557426, "grad_norm": 4.8298307691048175, "kl": 0.0517578125, "learning_rate": 5.396492000132325e-07, "loss": 0.0001, "reward": 3.28125, "reward_std": 0.6318193674087524, "rewards/accuracy_reward": 2.0562500953674316, "rewards/format_reward": 1.0, "step": 310, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 345.125, "epoch": 0.4762633996937213, "grad_norm": 4.827377648738647, "kl": 0.04833984375, "learning_rate": 5.372508181419851e-07, "loss": 0.0, "reward": 3.706249952316284, "reward_std": 0.4461830258369446, "rewards/accuracy_reward": 2.40625, "rewards/format_reward": 1.0, "step": 311, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 281.5, "epoch": 0.4777947932618683, "grad_norm": 6.296930290616834, "kl": 0.05078125, "learning_rate": 5.348515740700582e-07, "loss": 0.0001, "reward": 3.6624999046325684, "reward_std": 0.4589795470237732, "rewards/accuracy_reward": 2.362499952316284, "rewards/format_reward": 1.0, "step": 312, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 272.1875, "epoch": 0.4793261868300153, "grad_norm": 8.929722302544173, "kl": 0.0595703125, "learning_rate": 5.324515233299199e-07, "loss": 0.0001, "reward": 4.399999618530273, "reward_std": 0.37918606400489807, "rewards/accuracy_reward": 3.0999999046325684, "rewards/format_reward": 1.0, "step": 313, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 329.28125, "epoch": 0.48085758039816234, "grad_norm": 5.059035226884364, "kl": 0.04736328125, "learning_rate": 5.300507214727092e-07, "loss": 0.0, "reward": 3.0625, "reward_std": 0.4319705069065094, "rewards/accuracy_reward": 1.7624999284744263, "rewards/format_reward": 1.0, "step": 314, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 312.28125, "epoch": 0.48238897396630936, "grad_norm": 4.2357305471468045, "kl": 0.0517578125, "learning_rate": 5.276492240669503e-07, "loss": 0.0001, "reward": 3.4187498092651367, "reward_std": 0.5939319133758545, "rewards/accuracy_reward": 2.1187500953674316, "rewards/format_reward": 1.0, "step": 315, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 310.125, "epoch": 0.48392036753445633, "grad_norm": 5.052922851946215, "kl": 0.05029296875, "learning_rate": 5.252470866972668e-07, "loss": 0.0, "reward": 3.46875, "reward_std": 0.20276173949241638, "rewards/accuracy_reward": 2.2437498569488525, "rewards/format_reward": 1.0, "step": 316, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 287.53125, "epoch": 0.48545176110260335, "grad_norm": 14.798643962141302, "kl": 0.044921875, "learning_rate": 5.228443649630945e-07, "loss": 0.0, "reward": 4.143750190734863, "reward_std": 0.5091822743415833, "rewards/accuracy_reward": 2.84375, "rewards/format_reward": 1.0, "step": 317, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 286.1875, "epoch": 0.4869831546707504, "grad_norm": 3.342287669075317, "kl": 0.04541015625, "learning_rate": 5.204411144773944e-07, "loss": 0.0, "reward": 3.6812500953674316, "reward_std": 0.3361450731754303, "rewards/accuracy_reward": 2.3812496662139893, "rewards/format_reward": 1.0, "step": 318, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 295.25, "epoch": 0.4885145482388974, "grad_norm": 4.492148845651719, "kl": 0.05078125, "learning_rate": 5.180373908653667e-07, "loss": 0.0001, "reward": 3.2937498092651367, "reward_std": 0.43178924918174744, "rewards/accuracy_reward": 1.993749976158142, "rewards/format_reward": 1.0, "step": 319, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 278.25, "epoch": 0.4900459418070444, "grad_norm": 13.039966488350624, "kl": 0.04931640625, "learning_rate": 5.156332497631621e-07, "loss": 0.0, "reward": 3.456249952316284, "reward_std": 0.38298481702804565, "rewards/accuracy_reward": 2.15625, "rewards/format_reward": 1.0, "step": 320, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 279.875, "epoch": 0.49157733537519144, "grad_norm": 12.642622696044361, "kl": 0.055419921875, "learning_rate": 5.13228746816594e-07, "loss": 0.0001, "reward": 3.887500286102295, "reward_std": 0.4415132403373718, "rewards/accuracy_reward": 2.5874998569488525, "rewards/format_reward": 1.0, "step": 321, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 273.46875, "epoch": 0.49310872894333846, "grad_norm": 4.286766253570016, "kl": 0.053466796875, "learning_rate": 5.10823937679852e-07, "loss": 0.0001, "reward": 4.28125, "reward_std": 0.3476155996322632, "rewards/accuracy_reward": 3.0562498569488525, "rewards/format_reward": 1.0, "step": 322, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 331.03125, "epoch": 0.4946401225114854, "grad_norm": 5.229987914897258, "kl": 0.05224609375, "learning_rate": 5.084188780142118e-07, "loss": 0.0001, "reward": 3.9250001907348633, "reward_std": 0.6279107928276062, "rewards/accuracy_reward": 2.6999998092651367, "rewards/format_reward": 1.0, "step": 323, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 316.9375, "epoch": 0.49617151607963245, "grad_norm": 28.49902177125843, "kl": 0.05517578125, "learning_rate": 5.060136234867484e-07, "loss": 0.0001, "reward": 3.2312498092651367, "reward_std": 0.4152737259864807, "rewards/accuracy_reward": 1.9312498569488525, "rewards/format_reward": 1.0, "step": 324, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 289.59375, "epoch": 0.49770290964777947, "grad_norm": 7.1439008267200235, "kl": 0.051025390625, "learning_rate": 5.036082297690464e-07, "loss": 0.0001, "reward": 3.6312499046325684, "reward_std": 0.3471168279647827, "rewards/accuracy_reward": 2.331249952316284, "rewards/format_reward": 1.0, "step": 325, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 273.46875, "epoch": 0.4992343032159265, "grad_norm": 11.072867676579405, "kl": 0.046875, "learning_rate": 5.012027525359129e-07, "loss": 0.0, "reward": 3.5, "reward_std": 0.30619415640830994, "rewards/accuracy_reward": 2.200000047683716, "rewards/format_reward": 1.0, "step": 326, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 281.125, "epoch": 0.5007656967840735, "grad_norm": 21.889693980675588, "kl": 0.054443359375, "learning_rate": 4.987972474640873e-07, "loss": 0.0001, "reward": 4.412499904632568, "reward_std": 0.32305750250816345, "rewards/accuracy_reward": 3.112499713897705, "rewards/format_reward": 1.0, "step": 327, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 337.53125, "epoch": 0.5022970903522205, "grad_norm": 3.952527777748008, "kl": 0.045654296875, "learning_rate": 4.963917702309536e-07, "loss": 0.0, "reward": 3.8562498092651367, "reward_std": 0.39377397298812866, "rewards/accuracy_reward": 2.5562500953674316, "rewards/format_reward": 1.0, "step": 328, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 341.21875, "epoch": 0.5038284839203675, "grad_norm": 5.346886118235814, "kl": 0.047607421875, "learning_rate": 4.939863765132519e-07, "loss": 0.0, "reward": 2.9749999046325684, "reward_std": 0.42998963594436646, "rewards/accuracy_reward": 1.75, "rewards/format_reward": 1.0, "step": 329, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 276.375, "epoch": 0.5053598774885145, "grad_norm": 6.679344822369119, "kl": 0.049072265625, "learning_rate": 4.915811219857882e-07, "loss": 0.0, "reward": 3.7437500953674316, "reward_std": 0.5443640947341919, "rewards/accuracy_reward": 2.4437499046325684, "rewards/format_reward": 1.0, "step": 330, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 300.8125, "epoch": 0.5068912710566615, "grad_norm": 34.57022446692435, "kl": 0.047119140625, "learning_rate": 4.891760623201481e-07, "loss": 0.0, "reward": 3.65625, "reward_std": 0.5190573930740356, "rewards/accuracy_reward": 2.4312498569488525, "rewards/format_reward": 1.0, "step": 331, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 330.8125, "epoch": 0.5084226646248086, "grad_norm": 8.057551366527573, "kl": 0.0458984375, "learning_rate": 4.86771253183406e-07, "loss": 0.0, "reward": 3.9812498092651367, "reward_std": 0.4452638328075409, "rewards/accuracy_reward": 2.6812498569488525, "rewards/format_reward": 1.0, "step": 332, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 277.6875, "epoch": 0.5099540581929556, "grad_norm": 26.453879255218297, "kl": 0.0546875, "learning_rate": 4.84366750236838e-07, "loss": 0.0001, "reward": 3.8249998092651367, "reward_std": 0.46865373849868774, "rewards/accuracy_reward": 2.5250000953674316, "rewards/format_reward": 1.0, "step": 333, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 310.875, "epoch": 0.5114854517611026, "grad_norm": 4.902733023855285, "kl": 0.054443359375, "learning_rate": 4.819626091346333e-07, "loss": 0.0001, "reward": 3.0437498092651367, "reward_std": 0.2876538038253784, "rewards/accuracy_reward": 1.743749976158142, "rewards/format_reward": 1.0, "step": 334, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 290.28125, "epoch": 0.5130168453292496, "grad_norm": 58.51949600354415, "kl": 0.046142578125, "learning_rate": 4.795588855226055e-07, "loss": 0.0, "reward": 3.5437498092651367, "reward_std": 0.41970500349998474, "rewards/accuracy_reward": 2.2437500953674316, "rewards/format_reward": 1.0, "step": 335, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 267.0625, "epoch": 0.5145482388973966, "grad_norm": 5.658384395157894, "kl": 0.05126953125, "learning_rate": 4.771556350369056e-07, "loss": 0.0001, "reward": 4.050000190734863, "reward_std": 0.347625732421875, "rewards/accuracy_reward": 2.825000047683716, "rewards/format_reward": 1.0, "step": 336, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 305.5625, "epoch": 0.5160796324655437, "grad_norm": 10.144687298640607, "kl": 0.04638671875, "learning_rate": 4.7475291330273314e-07, "loss": 0.0, "reward": 4.212500095367432, "reward_std": 0.3171128034591675, "rewards/accuracy_reward": 2.9124999046325684, "rewards/format_reward": 1.0, "step": 337, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 308.96875, "epoch": 0.5176110260336907, "grad_norm": 6.512792003516793, "kl": 0.050048828125, "learning_rate": 4.7235077593304954e-07, "loss": 0.0001, "reward": 3.1687498092651367, "reward_std": 0.5290573835372925, "rewards/accuracy_reward": 2.018749952316284, "rewards/format_reward": 1.0, "step": 338, "temporal_rewards": 0.5 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 273.3125, "epoch": 0.5191424196018377, "grad_norm": 5.588824053582611, "kl": 0.04931640625, "learning_rate": 4.6994927852729085e-07, "loss": 0.0, "reward": 3.418750047683716, "reward_std": 0.39333152770996094, "rewards/accuracy_reward": 2.1937499046325684, "rewards/format_reward": 1.0, "step": 339, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 312.875, "epoch": 0.5206738131699847, "grad_norm": 15.184768822679787, "kl": 0.044189453125, "learning_rate": 4.6754847667008004e-07, "loss": 0.0, "reward": 4.112500190734863, "reward_std": 0.5456335544586182, "rewards/accuracy_reward": 2.8125, "rewards/format_reward": 1.0, "step": 340, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 348.65625, "epoch": 0.5222052067381318, "grad_norm": 6.794546630661044, "kl": 0.046630859375, "learning_rate": 4.6514842592994176e-07, "loss": 0.0, "reward": 3.862499952316284, "reward_std": 0.4044283926486969, "rewards/accuracy_reward": 2.5625, "rewards/format_reward": 1.0, "step": 341, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 268.4375, "epoch": 0.5237366003062787, "grad_norm": 7.093920182211075, "kl": 0.0546875, "learning_rate": 4.627491818580149e-07, "loss": 0.0001, "reward": 4.349999904632568, "reward_std": 0.43522968888282776, "rewards/accuracy_reward": 3.0500001907348633, "rewards/format_reward": 1.0, "step": 342, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 317.21875, "epoch": 0.5252679938744257, "grad_norm": 7.521140980470885, "kl": 0.052490234375, "learning_rate": 4.6035079998676755e-07, "loss": 0.0001, "reward": 3.8499999046325684, "reward_std": 0.3446093201637268, "rewards/accuracy_reward": 2.549999952316284, "rewards/format_reward": 1.0, "step": 343, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 300.375, "epoch": 0.5267993874425727, "grad_norm": 48.83516267703524, "kl": 0.05517578125, "learning_rate": 4.5795333582871133e-07, "loss": 0.0001, "reward": 3.549999952316284, "reward_std": 0.5778024196624756, "rewards/accuracy_reward": 2.25, "rewards/format_reward": 1.0, "step": 344, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 268.28125, "epoch": 0.5283307810107197, "grad_norm": 5.5659331458819015, "kl": 0.05859375, "learning_rate": 4.5555684487511693e-07, "loss": 0.0001, "reward": 4.050000190734863, "reward_std": 0.3617284893989563, "rewards/accuracy_reward": 2.75, "rewards/format_reward": 1.0, "step": 345, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 315.125, "epoch": 0.5298621745788668, "grad_norm": 8.836341008055525, "kl": 0.05810546875, "learning_rate": 4.5316138259472915e-07, "loss": 0.0001, "reward": 4.099999904632568, "reward_std": 0.3962700366973877, "rewards/accuracy_reward": 2.799999952316284, "rewards/format_reward": 1.0, "step": 346, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 281.40625, "epoch": 0.5313935681470138, "grad_norm": 6.7857182701070995, "kl": 0.060302734375, "learning_rate": 4.507670044324833e-07, "loss": 0.0001, "reward": 4.0625, "reward_std": 0.5822941064834595, "rewards/accuracy_reward": 2.762500047683716, "rewards/format_reward": 1.0, "step": 347, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 272.65625, "epoch": 0.5329249617151608, "grad_norm": 21.70038600264373, "kl": 0.052001953125, "learning_rate": 4.483737658082223e-07, "loss": 0.0001, "reward": 3.5875000953674316, "reward_std": 0.5105255842208862, "rewards/accuracy_reward": 2.362499952316284, "rewards/format_reward": 1.0, "step": 348, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 309.6875, "epoch": 0.5344563552833078, "grad_norm": 8.03758245428532, "kl": 0.0537109375, "learning_rate": 4.459817221154129e-07, "loss": 0.0001, "reward": 3.5187501907348633, "reward_std": 0.49836862087249756, "rewards/accuracy_reward": 2.21875, "rewards/format_reward": 1.0, "step": 349, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 294.71875, "epoch": 0.5359877488514548, "grad_norm": 26.176585081247165, "kl": 0.05419921875, "learning_rate": 4.435909287198646e-07, "loss": 0.0001, "reward": 3.3312501907348633, "reward_std": 0.25713980197906494, "rewards/accuracy_reward": 2.1812500953674316, "rewards/format_reward": 1.0, "step": 350, "temporal_rewards": 0.5 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 363.59375, "epoch": 0.5375191424196019, "grad_norm": 4.189754049368454, "kl": 0.050048828125, "learning_rate": 4.4120144095844773e-07, "loss": 0.0, "reward": 3.03125, "reward_std": 0.27744585275650024, "rewards/accuracy_reward": 1.806249976158142, "rewards/format_reward": 1.0, "step": 351, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 267.625, "epoch": 0.5390505359877489, "grad_norm": 5.923836672176543, "kl": 0.0546875, "learning_rate": 4.3881331413781247e-07, "loss": 0.0001, "reward": 3.987499713897705, "reward_std": 0.5389834046363831, "rewards/accuracy_reward": 2.6875, "rewards/format_reward": 1.0, "step": 352, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 267.5625, "epoch": 0.5405819295558959, "grad_norm": 24.676473640178664, "kl": 0.05615234375, "learning_rate": 4.364266035331091e-07, "loss": 0.0001, "reward": 4.131249904632568, "reward_std": 0.5211171507835388, "rewards/accuracy_reward": 2.90625, "rewards/format_reward": 1.0, "step": 353, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 286.75, "epoch": 0.5421133231240429, "grad_norm": 5.219576885609563, "kl": 0.05859375, "learning_rate": 4.340413643867083e-07, "loss": 0.0001, "reward": 3.3687500953674316, "reward_std": 0.5183711647987366, "rewards/accuracy_reward": 2.1437501907348633, "rewards/format_reward": 1.0, "step": 354, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 293.78125, "epoch": 0.5436447166921899, "grad_norm": 8.935709493601047, "kl": 0.0498046875, "learning_rate": 4.316576519069226e-07, "loss": 0.0, "reward": 3.8562498092651367, "reward_std": 0.37087583541870117, "rewards/accuracy_reward": 2.5562498569488525, "rewards/format_reward": 1.0, "step": 355, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 330.53125, "epoch": 0.5451761102603369, "grad_norm": 8.829545099860825, "kl": 0.05810546875, "learning_rate": 4.2927552126672887e-07, "loss": 0.0001, "reward": 3.6812498569488525, "reward_std": 0.4220343232154846, "rewards/accuracy_reward": 2.3812499046325684, "rewards/format_reward": 1.0, "step": 356, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 278.65625, "epoch": 0.5467075038284839, "grad_norm": 6.219717164837179, "kl": 0.057861328125, "learning_rate": 4.2689502760249057e-07, "loss": 0.0001, "reward": 3.418750047683716, "reward_std": 0.44134071469306946, "rewards/accuracy_reward": 2.1187500953674316, "rewards/format_reward": 1.0, "step": 357, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 353.75, "epoch": 0.5482388973966309, "grad_norm": 5.556766815792866, "kl": 0.05029296875, "learning_rate": 4.245162260126823e-07, "loss": 0.0001, "reward": 3.625, "reward_std": 0.4050983190536499, "rewards/accuracy_reward": 2.3249998092651367, "rewards/format_reward": 1.0, "step": 358, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 344.8125, "epoch": 0.5497702909647779, "grad_norm": 3.676604375125933, "kl": 0.05078125, "learning_rate": 4.2213917155661405e-07, "loss": 0.0001, "reward": 2.8499999046325684, "reward_std": 0.29664111137390137, "rewards/accuracy_reward": 1.625, "rewards/format_reward": 1.0, "step": 359, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 291.8125, "epoch": 0.5513016845329249, "grad_norm": 6.86923911826066, "kl": 0.06005859375, "learning_rate": 4.197639192531573e-07, "loss": 0.0001, "reward": 3.606250047683716, "reward_std": 0.3868841528892517, "rewards/accuracy_reward": 2.3062500953674316, "rewards/format_reward": 1.0, "step": 360, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 312.5, "epoch": 0.552833078101072, "grad_norm": 10.259674346863441, "kl": 0.056884765625, "learning_rate": 4.1739052407947075e-07, "loss": 0.0001, "reward": 3.1999998092651367, "reward_std": 0.24718201160430908, "rewards/accuracy_reward": 1.975000023841858, "rewards/format_reward": 1.0, "step": 361, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 322.84375, "epoch": 0.554364471669219, "grad_norm": 5.8426688029571485, "kl": 0.05517578125, "learning_rate": 4.150190409697288e-07, "loss": 0.0001, "reward": 4.425000190734863, "reward_std": 0.3833528757095337, "rewards/accuracy_reward": 3.125, "rewards/format_reward": 1.0, "step": 362, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 309.9375, "epoch": 0.555895865237366, "grad_norm": 5.259636552958595, "kl": 0.053466796875, "learning_rate": 4.126495248138492e-07, "loss": 0.0001, "reward": 4.081250190734863, "reward_std": 0.38400453329086304, "rewards/accuracy_reward": 2.781250238418579, "rewards/format_reward": 1.0, "step": 363, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 329.0, "epoch": 0.557427258805513, "grad_norm": 6.60110747813211, "kl": 0.054931640625, "learning_rate": 4.10282030456223e-07, "loss": 0.0001, "reward": 3.062499761581421, "reward_std": 0.5393983721733093, "rewards/accuracy_reward": 1.837499976158142, "rewards/format_reward": 1.0, "step": 364, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 299.03125, "epoch": 0.55895865237366, "grad_norm": 4.253141502103047, "kl": 0.054931640625, "learning_rate": 4.079166126944453e-07, "loss": 0.0001, "reward": 4.524999618530273, "reward_std": 0.1969119757413864, "rewards/accuracy_reward": 3.2249999046325684, "rewards/format_reward": 1.0, "step": 365, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 311.625, "epoch": 0.5604900459418071, "grad_norm": 6.796068903920509, "kl": 0.05126953125, "learning_rate": 4.055533262780464e-07, "loss": 0.0001, "reward": 2.768749952316284, "reward_std": 0.2892647087574005, "rewards/accuracy_reward": 1.46875, "rewards/format_reward": 1.0, "step": 366, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 311.21875, "epoch": 0.5620214395099541, "grad_norm": 7.758597041925383, "kl": 0.0537109375, "learning_rate": 4.031922259072252e-07, "loss": 0.0001, "reward": 3.7750000953674316, "reward_std": 0.529064953327179, "rewards/accuracy_reward": 2.4749999046325684, "rewards/format_reward": 1.0, "step": 367, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 282.59375, "epoch": 0.5635528330781011, "grad_norm": 44.275697473879234, "kl": 0.06298828125, "learning_rate": 4.0083336623158236e-07, "loss": 0.0001, "reward": 4.168749809265137, "reward_std": 0.43980512022972107, "rewards/accuracy_reward": 2.9437499046325684, "rewards/format_reward": 1.0, "step": 368, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 308.46875, "epoch": 0.5650842266462481, "grad_norm": 9.086010791037292, "kl": 0.059326171875, "learning_rate": 3.9847680184885613e-07, "loss": 0.0001, "reward": 4.131250381469727, "reward_std": 0.32010617852211, "rewards/accuracy_reward": 2.831249952316284, "rewards/format_reward": 1.0, "step": 369, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 308.0, "epoch": 0.5666156202143952, "grad_norm": 11.913129345112655, "kl": 0.04638671875, "learning_rate": 3.9612258730365823e-07, "loss": 0.0, "reward": 3.6187498569488525, "reward_std": 0.41490620374679565, "rewards/accuracy_reward": 2.3187499046325684, "rewards/format_reward": 1.0, "step": 370, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 337.5625, "epoch": 0.5681470137825421, "grad_norm": 6.288735778589191, "kl": 0.0546875, "learning_rate": 3.9377077708621167e-07, "loss": 0.0001, "reward": 3.6999998092651367, "reward_std": 0.41836118698120117, "rewards/accuracy_reward": 2.3999998569488525, "rewards/format_reward": 1.0, "step": 371, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 314.375, "epoch": 0.5696784073506891, "grad_norm": 9.996390173583332, "kl": 0.05712890625, "learning_rate": 3.914214256310887e-07, "loss": 0.0001, "reward": 3.3249998092651367, "reward_std": 0.7744901180267334, "rewards/accuracy_reward": 2.1000001430511475, "rewards/format_reward": 1.0, "step": 372, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 323.28125, "epoch": 0.5712098009188361, "grad_norm": 5.431898052655735, "kl": 0.060791015625, "learning_rate": 3.8907458731595223e-07, "loss": 0.0001, "reward": 3.200000047683716, "reward_std": 0.3381873369216919, "rewards/accuracy_reward": 1.899999976158142, "rewards/format_reward": 1.0, "step": 373, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 271.53125, "epoch": 0.5727411944869831, "grad_norm": 3.868776732403, "kl": 0.0634765625, "learning_rate": 3.867303164602961e-07, "loss": 0.0001, "reward": 3.875, "reward_std": 0.3560502529144287, "rewards/accuracy_reward": 2.6499998569488525, "rewards/format_reward": 1.0, "step": 374, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 298.96875, "epoch": 0.5742725880551302, "grad_norm": 6.940948762708857, "kl": 0.0615234375, "learning_rate": 3.843886673241883e-07, "loss": 0.0001, "reward": 3.7124998569488525, "reward_std": 0.4306986927986145, "rewards/accuracy_reward": 2.4124999046325684, "rewards/format_reward": 1.0, "step": 375, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 316.3125, "epoch": 0.5758039816232772, "grad_norm": 53.90165318932112, "kl": 0.0615234375, "learning_rate": 3.8204969410701505e-07, "loss": 0.0001, "reward": 4.125, "reward_std": 0.49573537707328796, "rewards/accuracy_reward": 2.8999998569488525, "rewards/format_reward": 1.0, "step": 376, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 326.34375, "epoch": 0.5773353751914242, "grad_norm": 4.124093408449859, "kl": 0.051513671875, "learning_rate": 3.797134509462261e-07, "loss": 0.0001, "reward": 3.7750000953674316, "reward_std": 0.30634891986846924, "rewards/accuracy_reward": 2.4749999046325684, "rewards/format_reward": 1.0, "step": 377, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 313.03125, "epoch": 0.5788667687595712, "grad_norm": 12.732241959905021, "kl": 0.05859375, "learning_rate": 3.773799919160817e-07, "loss": 0.0001, "reward": 3.5, "reward_std": 0.43782657384872437, "rewards/accuracy_reward": 2.200000047683716, "rewards/format_reward": 1.0, "step": 378, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 339.40625, "epoch": 0.5803981623277182, "grad_norm": 30.76472108606479, "kl": 0.055419921875, "learning_rate": 3.750493710264016e-07, "loss": 0.0001, "reward": 3.081249952316284, "reward_std": 0.2554660439491272, "rewards/accuracy_reward": 1.78125, "rewards/format_reward": 1.0, "step": 379, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 333.8125, "epoch": 0.5819295558958653, "grad_norm": 7.012893283715469, "kl": 0.052001953125, "learning_rate": 3.7272164222131387e-07, "loss": 0.0001, "reward": 3.731250047683716, "reward_std": 0.4654560089111328, "rewards/accuracy_reward": 2.4312500953674316, "rewards/format_reward": 1.0, "step": 380, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 331.09375, "epoch": 0.5834609494640123, "grad_norm": 7.483991726584263, "kl": 0.052978515625, "learning_rate": 3.703968593780074e-07, "loss": 0.0001, "reward": 3.937499761581421, "reward_std": 0.6161357760429382, "rewards/accuracy_reward": 2.637500047683716, "rewards/format_reward": 1.0, "step": 381, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 328.9375, "epoch": 0.5849923430321593, "grad_norm": 7.708594456755339, "kl": 0.04833984375, "learning_rate": 3.6807507630548394e-07, "loss": 0.0, "reward": 3.4187498092651367, "reward_std": 0.3879605531692505, "rewards/accuracy_reward": 2.2687501907348633, "rewards/format_reward": 1.0, "step": 382, "temporal_rewards": 0.5 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 338.21875, "epoch": 0.5865237366003063, "grad_norm": 7.932311484776607, "kl": 0.053955078125, "learning_rate": 3.657563467433134e-07, "loss": 0.0001, "reward": 3.518749713897705, "reward_std": 0.5936441421508789, "rewards/accuracy_reward": 2.21875, "rewards/format_reward": 1.0, "step": 383, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 279.375, "epoch": 0.5880551301684533, "grad_norm": 6.986880473373834, "kl": 0.0673828125, "learning_rate": 3.6344072436038976e-07, "loss": 0.0001, "reward": 5.175000190734863, "reward_std": 0.2715410590171814, "rewards/accuracy_reward": 3.874999761581421, "rewards/format_reward": 1.0, "step": 384, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 315.75, "epoch": 0.5895865237366003, "grad_norm": 8.544800046398805, "kl": 0.06103515625, "learning_rate": 3.611282627536887e-07, "loss": 0.0001, "reward": 4.256249904632568, "reward_std": 0.48312920331954956, "rewards/accuracy_reward": 2.956249952316284, "rewards/format_reward": 1.0, "step": 385, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 318.1875, "epoch": 0.5911179173047473, "grad_norm": 96.50426243207977, "kl": 0.06396484375, "learning_rate": 3.5881901544702673e-07, "loss": 0.0001, "reward": 3.856250047683716, "reward_std": 0.43018585443496704, "rewards/accuracy_reward": 2.5562498569488525, "rewards/format_reward": 1.0, "step": 386, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 284.0, "epoch": 0.5926493108728943, "grad_norm": 4.379736134724126, "kl": 0.0693359375, "learning_rate": 3.565130358898233e-07, "loss": 0.0001, "reward": 3.737499713897705, "reward_std": 0.45637887716293335, "rewards/accuracy_reward": 2.5124998092651367, "rewards/format_reward": 1.0, "step": 387, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 295.25, "epoch": 0.5941807044410413, "grad_norm": 19.658835383842874, "kl": 0.0634765625, "learning_rate": 3.54210377455863e-07, "loss": 0.0001, "reward": 3.9562501907348633, "reward_std": 0.4577101171016693, "rewards/accuracy_reward": 2.65625, "rewards/format_reward": 1.0, "step": 388, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 332.34375, "epoch": 0.5957120980091883, "grad_norm": 8.319861598093148, "kl": 0.05908203125, "learning_rate": 3.519110934420602e-07, "loss": 0.0001, "reward": 4.018750190734863, "reward_std": 0.49576184153556824, "rewards/accuracy_reward": 2.71875, "rewards/format_reward": 1.0, "step": 389, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 343.59375, "epoch": 0.5972434915773354, "grad_norm": 8.981467534724874, "kl": 0.05517578125, "learning_rate": 3.496152370672255e-07, "loss": 0.0001, "reward": 3.637500047683716, "reward_std": 0.5412981510162354, "rewards/accuracy_reward": 2.3375000953674316, "rewards/format_reward": 1.0, "step": 390, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 294.8125, "epoch": 0.5987748851454824, "grad_norm": 4.408603122067736, "kl": 0.0634765625, "learning_rate": 3.4732286147083435e-07, "loss": 0.0001, "reward": 4.600000381469727, "reward_std": 0.4068170189857483, "rewards/accuracy_reward": 3.375, "rewards/format_reward": 1.0, "step": 391, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 356.6875, "epoch": 0.6003062787136294, "grad_norm": 5.306185280043634, "kl": 0.051513671875, "learning_rate": 3.450340197117962e-07, "loss": 0.0001, "reward": 3.6500000953674316, "reward_std": 0.31280529499053955, "rewards/accuracy_reward": 2.3500001430511475, "rewards/format_reward": 1.0, "step": 392, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 318.65625, "epoch": 0.6018376722817764, "grad_norm": 19.013497776702604, "kl": 0.06298828125, "learning_rate": 3.427487647672274e-07, "loss": 0.0001, "reward": 3.7750000953674316, "reward_std": 0.4099277853965759, "rewards/accuracy_reward": 2.4749999046325684, "rewards/format_reward": 1.0, "step": 393, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 301.21875, "epoch": 0.6033690658499234, "grad_norm": 7.498947178310738, "kl": 0.07080078125, "learning_rate": 3.4046714953122435e-07, "loss": 0.0001, "reward": 3.9312498569488525, "reward_std": 0.3217979073524475, "rewards/accuracy_reward": 2.6312499046325684, "rewards/format_reward": 1.0, "step": 394, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 277.21875, "epoch": 0.6049004594180705, "grad_norm": 5.258483288911297, "kl": 0.0673828125, "learning_rate": 3.381892268136392e-07, "loss": 0.0001, "reward": 3.950000047683716, "reward_std": 0.4231208562850952, "rewards/accuracy_reward": 2.6500000953674316, "rewards/format_reward": 1.0, "step": 395, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 283.46875, "epoch": 0.6064318529862175, "grad_norm": 11.648109176660908, "kl": 0.06884765625, "learning_rate": 3.359150493388583e-07, "loss": 0.0001, "reward": 4.356249809265137, "reward_std": 0.43625977635383606, "rewards/accuracy_reward": 3.0562498569488525, "rewards/format_reward": 1.0, "step": 396, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 322.71875, "epoch": 0.6079632465543645, "grad_norm": 4.297923922438784, "kl": 0.0654296875, "learning_rate": 3.3364466974458056e-07, "loss": 0.0001, "reward": 3.96875, "reward_std": 0.3672224283218384, "rewards/accuracy_reward": 2.6687498092651367, "rewards/format_reward": 1.0, "step": 397, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 332.8125, "epoch": 0.6094946401225115, "grad_norm": 7.58076316133248, "kl": 0.05615234375, "learning_rate": 3.313781405806006e-07, "loss": 0.0001, "reward": 3.0874998569488525, "reward_std": 0.44267088174819946, "rewards/accuracy_reward": 1.787500023841858, "rewards/format_reward": 1.0, "step": 398, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 356.5625, "epoch": 0.6110260336906586, "grad_norm": 5.811198431695378, "kl": 0.0595703125, "learning_rate": 3.291155143075912e-07, "loss": 0.0001, "reward": 3.2750000953674316, "reward_std": 0.3249671161174774, "rewards/accuracy_reward": 1.9749999046325684, "rewards/format_reward": 1.0, "step": 399, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 297.8125, "epoch": 0.6125574272588055, "grad_norm": 6.320383107288802, "kl": 0.0595703125, "learning_rate": 3.2685684329588956e-07, "loss": 0.0001, "reward": 4.068749904632568, "reward_std": 0.4821315407752991, "rewards/accuracy_reward": 2.768749952316284, "rewards/format_reward": 1.0, "step": 400, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 286.09375, "epoch": 0.6140888208269525, "grad_norm": 7.273549949269243, "kl": 0.06201171875, "learning_rate": 3.2460217982428513e-07, "loss": 0.0001, "reward": 4.587500095367432, "reward_std": 0.5616779327392578, "rewards/accuracy_reward": 3.2875001430511475, "rewards/format_reward": 1.0, "step": 401, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 327.53125, "epoch": 0.6156202143950995, "grad_norm": 5.96611061635415, "kl": 0.056884765625, "learning_rate": 3.223515760788098e-07, "loss": 0.0001, "reward": 3.7437498569488525, "reward_std": 0.345234751701355, "rewards/accuracy_reward": 2.4437499046325684, "rewards/format_reward": 1.0, "step": 402, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 302.375, "epoch": 0.6171516079632465, "grad_norm": 13.750305659870863, "kl": 0.061279296875, "learning_rate": 3.2010508415152946e-07, "loss": 0.0001, "reward": 3.6125001907348633, "reward_std": 0.502585768699646, "rewards/accuracy_reward": 2.3125, "rewards/format_reward": 1.0, "step": 403, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 273.46875, "epoch": 0.6186830015313936, "grad_norm": 17.49120172103442, "kl": 0.06396484375, "learning_rate": 3.1786275603933886e-07, "loss": 0.0001, "reward": 3.46875, "reward_std": 0.2607581615447998, "rewards/accuracy_reward": 2.168750047683716, "rewards/format_reward": 1.0, "step": 404, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 330.09375, "epoch": 0.6202143950995406, "grad_norm": 11.160363536103876, "kl": 0.068359375, "learning_rate": 3.1562464364275774e-07, "loss": 0.0001, "reward": 4.081250190734863, "reward_std": 0.49857833981513977, "rewards/accuracy_reward": 2.78125, "rewards/format_reward": 1.0, "step": 405, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 320.03125, "epoch": 0.6217457886676876, "grad_norm": 16.135854626517443, "kl": 0.05712890625, "learning_rate": 3.133907987647295e-07, "loss": 0.0001, "reward": 2.843749761581421, "reward_std": 0.350276917219162, "rewards/accuracy_reward": 1.5437500476837158, "rewards/format_reward": 1.0, "step": 406, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 316.0, "epoch": 0.6232771822358346, "grad_norm": 8.558951314586174, "kl": 0.06103515625, "learning_rate": 3.1116127310942263e-07, "loss": 0.0001, "reward": 3.8687498569488525, "reward_std": 0.3773455321788788, "rewards/accuracy_reward": 2.5687499046325684, "rewards/format_reward": 1.0, "step": 407, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 295.0, "epoch": 0.6248085758039816, "grad_norm": 5.9580071935367345, "kl": 0.0625, "learning_rate": 3.089361182810335e-07, "loss": 0.0001, "reward": 3.0812501907348633, "reward_std": 0.28288936614990234, "rewards/accuracy_reward": 1.78125, "rewards/format_reward": 1.0, "step": 408, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 327.6875, "epoch": 0.6263399693721287, "grad_norm": 7.083464778039577, "kl": 0.06787109375, "learning_rate": 3.0671538578259203e-07, "loss": 0.0001, "reward": 3.65625, "reward_std": 0.4088667929172516, "rewards/accuracy_reward": 2.3562498092651367, "rewards/format_reward": 1.0, "step": 409, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 318.25, "epoch": 0.6278713629402757, "grad_norm": 4.587386738325241, "kl": 0.06494140625, "learning_rate": 3.044991270147699e-07, "loss": 0.0001, "reward": 3.706249952316284, "reward_std": 0.45332545042037964, "rewards/accuracy_reward": 2.406249761581421, "rewards/format_reward": 1.0, "step": 410, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 356.5625, "epoch": 0.6294027565084227, "grad_norm": 4.485073681832358, "kl": 0.05419921875, "learning_rate": 3.0228739327469046e-07, "loss": 0.0001, "reward": 3.6312499046325684, "reward_std": 0.539216935634613, "rewards/accuracy_reward": 2.331249952316284, "rewards/format_reward": 1.0, "step": 411, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 287.75, "epoch": 0.6309341500765697, "grad_norm": 9.519163207799423, "kl": 0.059326171875, "learning_rate": 3.000802357547417e-07, "loss": 0.0001, "reward": 3.9312498569488525, "reward_std": 0.4099936783313751, "rewards/accuracy_reward": 2.6312499046325684, "rewards/format_reward": 1.0, "step": 412, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 295.625, "epoch": 0.6324655436447167, "grad_norm": 9.958954384986654, "kl": 0.07080078125, "learning_rate": 2.978777055413911e-07, "loss": 0.0001, "reward": 3.0687499046325684, "reward_std": 0.13203126192092896, "rewards/accuracy_reward": 1.7687499523162842, "rewards/format_reward": 1.0, "step": 413, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 291.625, "epoch": 0.6339969372128637, "grad_norm": 6.745076028703393, "kl": 0.06591796875, "learning_rate": 2.9567985361400376e-07, "loss": 0.0001, "reward": 3.793750047683716, "reward_std": 0.5402119755744934, "rewards/accuracy_reward": 2.4937500953674316, "rewards/format_reward": 1.0, "step": 414, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 346.5, "epoch": 0.6355283307810107, "grad_norm": 5.744964983707697, "kl": 0.0625, "learning_rate": 2.934867308436613e-07, "loss": 0.0001, "reward": 4.387499809265137, "reward_std": 0.40273937582969666, "rewards/accuracy_reward": 3.0874998569488525, "rewards/format_reward": 1.0, "step": 415, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 316.28125, "epoch": 0.6370597243491577, "grad_norm": 40.656643318281574, "kl": 0.064453125, "learning_rate": 2.912983879919857e-07, "loss": 0.0001, "reward": 3.78125, "reward_std": 0.4405216574668884, "rewards/accuracy_reward": 2.481250047683716, "rewards/format_reward": 1.0, "step": 416, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 300.3125, "epoch": 0.6385911179173047, "grad_norm": 4.359120589772653, "kl": 0.0595703125, "learning_rate": 2.891148757099636e-07, "loss": 0.0001, "reward": 4.112500190734863, "reward_std": 0.46361613273620605, "rewards/accuracy_reward": 2.8125, "rewards/format_reward": 1.0, "step": 417, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 293.15625, "epoch": 0.6401225114854517, "grad_norm": 13.317110893017306, "kl": 0.06494140625, "learning_rate": 2.8693624453677434e-07, "loss": 0.0001, "reward": 3.9437499046325684, "reward_std": 0.5202068090438843, "rewards/accuracy_reward": 2.643749952316284, "rewards/format_reward": 1.0, "step": 418, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 310.03125, "epoch": 0.6416539050535988, "grad_norm": 4.914353351589785, "kl": 0.056884765625, "learning_rate": 2.847625448986196e-07, "loss": 0.0001, "reward": 4.299999713897705, "reward_std": 0.5695346593856812, "rewards/accuracy_reward": 2.999999761581421, "rewards/format_reward": 1.0, "step": 419, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 335.9375, "epoch": 0.6431852986217458, "grad_norm": 11.187489124968776, "kl": 0.0625, "learning_rate": 2.825938271075572e-07, "loss": 0.0001, "reward": 2.9749999046325684, "reward_std": 0.6231825947761536, "rewards/accuracy_reward": 1.75, "rewards/format_reward": 1.0, "step": 420, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 305.625, "epoch": 0.6447166921898928, "grad_norm": 10.59483786175929, "kl": 0.064453125, "learning_rate": 2.804301413603356e-07, "loss": 0.0001, "reward": 3.7437500953674316, "reward_std": 0.4473724365234375, "rewards/accuracy_reward": 2.4437499046325684, "rewards/format_reward": 1.0, "step": 421, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 301.96875, "epoch": 0.6462480857580398, "grad_norm": 8.851351539181564, "kl": 0.078125, "learning_rate": 2.782715377372326e-07, "loss": 0.0001, "reward": 4.03125, "reward_std": 0.1525237262248993, "rewards/accuracy_reward": 2.731250047683716, "rewards/format_reward": 1.0, "step": 422, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 324.34375, "epoch": 0.6477794793261868, "grad_norm": 7.808986838862892, "kl": 0.058349609375, "learning_rate": 2.761180662008961e-07, "loss": 0.0001, "reward": 3.5562500953674316, "reward_std": 0.6510157585144043, "rewards/accuracy_reward": 2.2562499046325684, "rewards/format_reward": 1.0, "step": 423, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 329.15625, "epoch": 0.6493108728943339, "grad_norm": 7.364051668334138, "kl": 0.0654296875, "learning_rate": 2.7396977659518744e-07, "loss": 0.0001, "reward": 3.7937498092651367, "reward_std": 0.37267881631851196, "rewards/accuracy_reward": 2.4937500953674316, "rewards/format_reward": 1.0, "step": 424, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 305.53125, "epoch": 0.6508422664624809, "grad_norm": 16.69692399722496, "kl": 0.0595703125, "learning_rate": 2.7182671864402856e-07, "loss": 0.0001, "reward": 4.074999809265137, "reward_std": 0.3937183916568756, "rewards/accuracy_reward": 2.7749998569488525, "rewards/format_reward": 1.0, "step": 425, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 301.59375, "epoch": 0.6523736600306279, "grad_norm": 5.936387236448735, "kl": 0.07470703125, "learning_rate": 2.6968894195024984e-07, "loss": 0.0001, "reward": 3.237499713897705, "reward_std": 0.22437289357185364, "rewards/accuracy_reward": 1.9375, "rewards/format_reward": 1.0, "step": 426, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 310.71875, "epoch": 0.6539050535987749, "grad_norm": 6.076679157286225, "kl": 0.068359375, "learning_rate": 2.6755649599444287e-07, "loss": 0.0001, "reward": 3.950000047683716, "reward_std": 0.6021788120269775, "rewards/accuracy_reward": 2.6500000953674316, "rewards/format_reward": 1.0, "step": 427, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 277.0625, "epoch": 0.655436447166922, "grad_norm": 4.482861061619246, "kl": 0.060546875, "learning_rate": 2.654294301338149e-07, "loss": 0.0001, "reward": 3.831249952316284, "reward_std": 0.39099207520484924, "rewards/accuracy_reward": 2.53125, "rewards/format_reward": 1.0, "step": 428, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 271.9375, "epoch": 0.6569678407350689, "grad_norm": 4.806567872654773, "kl": 0.080078125, "learning_rate": 2.633077936010465e-07, "loss": 0.0001, "reward": 2.9499998092651367, "reward_std": 0.2927432060241699, "rewards/accuracy_reward": 1.7999999523162842, "rewards/format_reward": 1.0, "step": 429, "temporal_rewards": 0.5 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 341.78125, "epoch": 0.6584992343032159, "grad_norm": 4.716096742653952, "kl": 0.060546875, "learning_rate": 2.6119163550315194e-07, "loss": 0.0001, "reward": 3.1875, "reward_std": 0.3171377182006836, "rewards/accuracy_reward": 1.8875000476837158, "rewards/format_reward": 1.0, "step": 430, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 327.25, "epoch": 0.6600306278713629, "grad_norm": 10.997674271653253, "kl": 0.060546875, "learning_rate": 2.590810048203428e-07, "loss": 0.0001, "reward": 3.90625, "reward_std": 0.3396279215812683, "rewards/accuracy_reward": 2.6812500953674316, "rewards/format_reward": 1.0, "step": 431, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 336.75, "epoch": 0.6615620214395099, "grad_norm": 6.121337508508913, "kl": 0.06494140625, "learning_rate": 2.5697595040489386e-07, "loss": 0.0001, "reward": 4.09375, "reward_std": 0.541084349155426, "rewards/accuracy_reward": 2.793750047683716, "rewards/format_reward": 1.0, "step": 432, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 295.03125, "epoch": 0.663093415007657, "grad_norm": 14.90890760110821, "kl": 0.06689453125, "learning_rate": 2.5487652098001267e-07, "loss": 0.0001, "reward": 3.512500047683716, "reward_std": 0.3367306590080261, "rewards/accuracy_reward": 2.2874999046325684, "rewards/format_reward": 1.0, "step": 433, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 329.4375, "epoch": 0.664624808575804, "grad_norm": 4.949525751331033, "kl": 0.06787109375, "learning_rate": 2.5278276513871233e-07, "loss": 0.0001, "reward": 3.6937501430511475, "reward_std": 0.27078691124916077, "rewards/accuracy_reward": 2.3937501907348633, "rewards/format_reward": 1.0, "step": 434, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 269.40625, "epoch": 0.666156202143951, "grad_norm": 5.943192955867212, "kl": 0.0732421875, "learning_rate": 2.506947313426854e-07, "loss": 0.0001, "reward": 4.125, "reward_std": 0.36376625299453735, "rewards/accuracy_reward": 2.825000047683716, "rewards/format_reward": 1.0, "step": 435, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 269.0, "epoch": 0.667687595712098, "grad_norm": 19.813187254646646, "kl": 0.06591796875, "learning_rate": 2.486124679211834e-07, "loss": 0.0001, "reward": 4.425000190734863, "reward_std": 0.4399248957633972, "rewards/accuracy_reward": 3.125, "rewards/format_reward": 1.0, "step": 436, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 258.03125, "epoch": 0.669218989280245, "grad_norm": 6.357727250158085, "kl": 0.07080078125, "learning_rate": 2.465360230698978e-07, "loss": 0.0001, "reward": 4.699999809265137, "reward_std": 0.3344690203666687, "rewards/accuracy_reward": 3.4000000953674316, "rewards/format_reward": 1.0, "step": 437, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 290.46875, "epoch": 0.6707503828483921, "grad_norm": 6.171421947220471, "kl": 0.072265625, "learning_rate": 2.444654448498442e-07, "loss": 0.0001, "reward": 3.6437501907348633, "reward_std": 0.37767261266708374, "rewards/accuracy_reward": 2.418750047683716, "rewards/format_reward": 1.0, "step": 438, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 322.4375, "epoch": 0.6722817764165391, "grad_norm": 7.171340223302997, "kl": 0.062255859375, "learning_rate": 2.42400781186251e-07, "loss": 0.0001, "reward": 3.6812500953674316, "reward_std": 0.5633392333984375, "rewards/accuracy_reward": 2.3812499046325684, "rewards/format_reward": 1.0, "step": 439, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 357.0, "epoch": 0.6738131699846861, "grad_norm": 9.313462256754683, "kl": 0.06201171875, "learning_rate": 2.4034207986744847e-07, "loss": 0.0001, "reward": 3.856250047683716, "reward_std": 0.3225916028022766, "rewards/accuracy_reward": 2.5562500953674316, "rewards/format_reward": 1.0, "step": 440, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 293.28125, "epoch": 0.6753445635528331, "grad_norm": 7.9431002089449505, "kl": 0.06103515625, "learning_rate": 2.3828938854376408e-07, "loss": 0.0001, "reward": 3.9812498092651367, "reward_std": 0.4964829087257385, "rewards/accuracy_reward": 2.6812498569488525, "rewards/format_reward": 1.0, "step": 441, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 305.375, "epoch": 0.6768759571209801, "grad_norm": 4.236738187315109, "kl": 0.0673828125, "learning_rate": 2.362427547264187e-07, "loss": 0.0001, "reward": 3.762500286102295, "reward_std": 0.3254941701889038, "rewards/accuracy_reward": 2.4625000953674316, "rewards/format_reward": 1.0, "step": 442, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 340.375, "epoch": 0.678407350689127, "grad_norm": 7.778748180413386, "kl": 0.06689453125, "learning_rate": 2.3420222578642747e-07, "loss": 0.0001, "reward": 2.90625, "reward_std": 0.2956770658493042, "rewards/accuracy_reward": 1.6062499284744263, "rewards/format_reward": 1.0, "step": 443, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 322.59375, "epoch": 0.6799387442572741, "grad_norm": 14.038766966652355, "kl": 0.072265625, "learning_rate": 2.321678489535031e-07, "loss": 0.0001, "reward": 3.6374998092651367, "reward_std": 0.5000779628753662, "rewards/accuracy_reward": 2.3375000953674316, "rewards/format_reward": 1.0, "step": 444, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 313.46875, "epoch": 0.6814701378254211, "grad_norm": 5.017466913183842, "kl": 0.068359375, "learning_rate": 2.301396713149627e-07, "loss": 0.0001, "reward": 3.59375, "reward_std": 0.47141778469085693, "rewards/accuracy_reward": 2.293750047683716, "rewards/format_reward": 1.0, "step": 445, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 367.71875, "epoch": 0.6830015313935681, "grad_norm": 12.681507283176876, "kl": 0.06396484375, "learning_rate": 2.2811773981463805e-07, "loss": 0.0001, "reward": 3.6312501430511475, "reward_std": 0.4401986598968506, "rewards/accuracy_reward": 2.331249952316284, "rewards/format_reward": 1.0, "step": 446, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 322.84375, "epoch": 0.6845329249617151, "grad_norm": 18.330301394432702, "kl": 0.06396484375, "learning_rate": 2.2610210125178863e-07, "loss": 0.0001, "reward": 3.1875, "reward_std": 0.43023771047592163, "rewards/accuracy_reward": 1.8875000476837158, "rewards/format_reward": 1.0, "step": 447, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 326.0, "epoch": 0.6860643185298622, "grad_norm": 17.894755319302757, "kl": 0.06689453125, "learning_rate": 2.2409280228001937e-07, "loss": 0.0001, "reward": 3.4375, "reward_std": 0.44989362359046936, "rewards/accuracy_reward": 2.2125000953674316, "rewards/format_reward": 1.0, "step": 448, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 318.0625, "epoch": 0.6875957120980092, "grad_norm": 6.212149538425316, "kl": 0.0693359375, "learning_rate": 2.220898894061996e-07, "loss": 0.0001, "reward": 3.8812501430511475, "reward_std": 0.5551595091819763, "rewards/accuracy_reward": 2.581249952316284, "rewards/format_reward": 1.0, "step": 449, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 364.53125, "epoch": 0.6891271056661562, "grad_norm": 68.47456626232325, "kl": 0.06298828125, "learning_rate": 2.2009340898938738e-07, "loss": 0.0001, "reward": 3.5625, "reward_std": 0.31192710995674133, "rewards/accuracy_reward": 2.2624998092651367, "rewards/format_reward": 1.0, "step": 450, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 351.0625, "epoch": 0.6906584992343032, "grad_norm": 10.434349640876311, "kl": 0.060302734375, "learning_rate": 2.1810340723975635e-07, "loss": 0.0001, "reward": 4.143749713897705, "reward_std": 0.34711429476737976, "rewards/accuracy_reward": 2.84375, "rewards/format_reward": 1.0, "step": 451, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 274.3125, "epoch": 0.6921898928024502, "grad_norm": 4.79448716086322, "kl": 0.07470703125, "learning_rate": 2.1611993021752589e-07, "loss": 0.0001, "reward": 3.7124998569488525, "reward_std": 0.3149541914463043, "rewards/accuracy_reward": 2.4124999046325684, "rewards/format_reward": 1.0, "step": 452, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 343.03125, "epoch": 0.6937212863705973, "grad_norm": 5.386247307227581, "kl": 0.0634765625, "learning_rate": 2.1414302383189524e-07, "loss": 0.0001, "reward": 3.9937498569488525, "reward_std": 0.507817268371582, "rewards/accuracy_reward": 2.6937499046325684, "rewards/format_reward": 1.0, "step": 453, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 350.71875, "epoch": 0.6952526799387443, "grad_norm": 4.072342990706237, "kl": 0.06298828125, "learning_rate": 2.121727338399814e-07, "loss": 0.0001, "reward": 3.081249952316284, "reward_std": 0.3716287612915039, "rewards/accuracy_reward": 1.78125, "rewards/format_reward": 1.0, "step": 454, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 319.65625, "epoch": 0.6967840735068913, "grad_norm": 7.167623199805889, "kl": 0.0634765625, "learning_rate": 2.1020910584575891e-07, "loss": 0.0001, "reward": 3.1687498092651367, "reward_std": 0.4107249975204468, "rewards/accuracy_reward": 1.943750023841858, "rewards/format_reward": 1.0, "step": 455, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 307.84375, "epoch": 0.6983154670750383, "grad_norm": 9.46426779830963, "kl": 0.07275390625, "learning_rate": 2.0825218529900508e-07, "loss": 0.0001, "reward": 3.84375, "reward_std": 0.5541188716888428, "rewards/accuracy_reward": 2.6187498569488525, "rewards/format_reward": 1.0, "step": 456, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 323.375, "epoch": 0.6998468606431854, "grad_norm": 7.578977726220703, "kl": 0.06689453125, "learning_rate": 2.0630201749424796e-07, "loss": 0.0001, "reward": 3.9875001907348633, "reward_std": 0.6910339593887329, "rewards/accuracy_reward": 2.6875, "rewards/format_reward": 1.0, "step": 457, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 353.0, "epoch": 0.7013782542113323, "grad_norm": 5.199030719704364, "kl": 0.06494140625, "learning_rate": 2.0435864756971778e-07, "loss": 0.0001, "reward": 3.5437498092651367, "reward_std": 0.589798092842102, "rewards/accuracy_reward": 2.2437500953674316, "rewards/format_reward": 1.0, "step": 458, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 347.25, "epoch": 0.7029096477794793, "grad_norm": 22.602000771073744, "kl": 0.0625, "learning_rate": 2.0242212050630232e-07, "loss": 0.0001, "reward": 3.5437498092651367, "reward_std": 0.4135865271091461, "rewards/accuracy_reward": 2.2437498569488525, "rewards/format_reward": 1.0, "step": 459, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 322.78125, "epoch": 0.7044410413476263, "grad_norm": 9.367843330693155, "kl": 0.0703125, "learning_rate": 2.0049248112650563e-07, "loss": 0.0001, "reward": 4.237500190734863, "reward_std": 0.5218789577484131, "rewards/accuracy_reward": 2.9375, "rewards/format_reward": 1.0, "step": 460, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 343.8125, "epoch": 0.7059724349157733, "grad_norm": 15.50592945792412, "kl": 0.058349609375, "learning_rate": 1.9856977409341086e-07, "loss": 0.0001, "reward": 3.5625, "reward_std": 0.25935813784599304, "rewards/accuracy_reward": 2.2624998092651367, "rewards/format_reward": 1.0, "step": 461, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 305.78125, "epoch": 0.7075038284839203, "grad_norm": 11.738727534172867, "kl": 0.0615234375, "learning_rate": 1.9665404390964597e-07, "loss": 0.0001, "reward": 4.300000190734863, "reward_std": 0.417613685131073, "rewards/accuracy_reward": 2.999999761581421, "rewards/format_reward": 1.0, "step": 462, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 284.6875, "epoch": 0.7090352220520674, "grad_norm": 9.156696648441521, "kl": 0.07275390625, "learning_rate": 1.947453349163547e-07, "loss": 0.0001, "reward": 4.09375, "reward_std": 0.3098074197769165, "rewards/accuracy_reward": 2.7937498092651367, "rewards/format_reward": 1.0, "step": 463, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 321.75, "epoch": 0.7105666156202144, "grad_norm": 11.321426847522837, "kl": 0.061767578125, "learning_rate": 1.9284369129216892e-07, "loss": 0.0001, "reward": 3.2124998569488525, "reward_std": 0.4076748788356781, "rewards/accuracy_reward": 1.912500023841858, "rewards/format_reward": 1.0, "step": 464, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 306.65625, "epoch": 0.7120980091883614, "grad_norm": 5.638771275332884, "kl": 0.06494140625, "learning_rate": 1.9094915705218711e-07, "loss": 0.0001, "reward": 3.3812499046325684, "reward_std": 0.32593533396720886, "rewards/accuracy_reward": 2.081249952316284, "rewards/format_reward": 1.0, "step": 465, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 314.96875, "epoch": 0.7136294027565084, "grad_norm": 14.944883534220684, "kl": 0.07568359375, "learning_rate": 1.89061776046955e-07, "loss": 0.0001, "reward": 3.9000000953674316, "reward_std": 0.5437703132629395, "rewards/accuracy_reward": 2.5999999046325684, "rewards/format_reward": 1.0, "step": 466, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 327.71875, "epoch": 0.7151607963246555, "grad_norm": 5.386863078488171, "kl": 0.059326171875, "learning_rate": 1.8718159196145089e-07, "loss": 0.0001, "reward": 3.25, "reward_std": 0.34646961092948914, "rewards/accuracy_reward": 2.0250000953674316, "rewards/format_reward": 1.0, "step": 467, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 334.9375, "epoch": 0.7166921898928025, "grad_norm": 9.443240303349306, "kl": 0.0693359375, "learning_rate": 1.853086483140749e-07, "loss": 0.0001, "reward": 3.3187499046325684, "reward_std": 0.3992847502231598, "rewards/accuracy_reward": 2.018749952316284, "rewards/format_reward": 1.0, "step": 468, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 315.9375, "epoch": 0.7182235834609495, "grad_norm": 8.370818598421605, "kl": 0.0732421875, "learning_rate": 1.8344298845564072e-07, "loss": 0.0001, "reward": 3.481250047683716, "reward_std": 0.41231366991996765, "rewards/accuracy_reward": 2.2562499046325684, "rewards/format_reward": 1.0, "step": 469, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 311.0625, "epoch": 0.7197549770290965, "grad_norm": 9.241300531058982, "kl": 0.06982421875, "learning_rate": 1.8158465556837304e-07, "loss": 0.0001, "reward": 3.331249952316284, "reward_std": 0.44833511114120483, "rewards/accuracy_reward": 2.106250047683716, "rewards/format_reward": 1.0, "step": 470, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 279.6875, "epoch": 0.7212863705972435, "grad_norm": 7.054289319864945, "kl": 0.07080078125, "learning_rate": 1.797336926649078e-07, "loss": 0.0001, "reward": 4.387499809265137, "reward_std": 0.4259355068206787, "rewards/accuracy_reward": 3.0875003337860107, "rewards/format_reward": 1.0, "step": 471, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 320.40625, "epoch": 0.7228177641653905, "grad_norm": 4.2441745477087895, "kl": 0.072265625, "learning_rate": 1.7789014258729657e-07, "loss": 0.0001, "reward": 3.875, "reward_std": 0.535858154296875, "rewards/accuracy_reward": 2.575000047683716, "rewards/format_reward": 1.0, "step": 472, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 310.90625, "epoch": 0.7243491577335375, "grad_norm": 19.64611885820091, "kl": 0.0634765625, "learning_rate": 1.7605404800601498e-07, "loss": 0.0001, "reward": 3.53125, "reward_std": 0.4168233573436737, "rewards/accuracy_reward": 2.231250047683716, "rewards/format_reward": 1.0, "step": 473, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 347.59375, "epoch": 0.7258805513016845, "grad_norm": 6.744904142124907, "kl": 0.06201171875, "learning_rate": 1.7422545141897522e-07, "loss": 0.0001, "reward": 3.7312498092651367, "reward_std": 0.4904130697250366, "rewards/accuracy_reward": 2.4312498569488525, "rewards/format_reward": 1.0, "step": 474, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 302.1875, "epoch": 0.7274119448698315, "grad_norm": 7.837268220751343, "kl": 0.0703125, "learning_rate": 1.7240439515054218e-07, "loss": 0.0001, "reward": 3.3312501907348633, "reward_std": 0.13335174322128296, "rewards/accuracy_reward": 2.03125, "rewards/format_reward": 1.0, "step": 475, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 348.21875, "epoch": 0.7289433384379785, "grad_norm": 5.818240284000521, "kl": 0.062255859375, "learning_rate": 1.705909213505537e-07, "loss": 0.0001, "reward": 3.231250047683716, "reward_std": 0.3217264711856842, "rewards/accuracy_reward": 1.9312500953674316, "rewards/format_reward": 1.0, "step": 476, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 346.71875, "epoch": 0.7304747320061256, "grad_norm": 7.677825777713008, "kl": 0.06396484375, "learning_rate": 1.687850719933458e-07, "loss": 0.0001, "reward": 3.887500047683716, "reward_std": 0.5294345617294312, "rewards/accuracy_reward": 2.5875000953674316, "rewards/format_reward": 1.0, "step": 477, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 314.5625, "epoch": 0.7320061255742726, "grad_norm": 7.475514037163483, "kl": 0.0673828125, "learning_rate": 1.6698688887677993e-07, "loss": 0.0001, "reward": 3.518749952316284, "reward_std": 0.284912109375, "rewards/accuracy_reward": 2.21875, "rewards/format_reward": 1.0, "step": 478, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 362.65625, "epoch": 0.7335375191424196, "grad_norm": 12.349104248475921, "kl": 0.0595703125, "learning_rate": 1.6519641362127628e-07, "loss": 0.0001, "reward": 3.3125, "reward_std": 0.5435852408409119, "rewards/accuracy_reward": 2.0124998092651367, "rewards/format_reward": 1.0, "step": 479, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 319.25, "epoch": 0.7350689127105666, "grad_norm": 4.048801751798312, "kl": 0.0751953125, "learning_rate": 1.634136876688504e-07, "loss": 0.0001, "reward": 3.543750047683716, "reward_std": 0.42121684551239014, "rewards/accuracy_reward": 2.2437500953674316, "rewards/format_reward": 1.0, "step": 480, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 321.75, "epoch": 0.7366003062787136, "grad_norm": 6.4005433756299785, "kl": 0.06787109375, "learning_rate": 1.6163875228215351e-07, "loss": 0.0001, "reward": 3.5999999046325684, "reward_std": 0.42963576316833496, "rewards/accuracy_reward": 2.3000001907348633, "rewards/format_reward": 1.0, "step": 481, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 292.5, "epoch": 0.7381316998468607, "grad_norm": 9.877103419548662, "kl": 0.07421875, "learning_rate": 1.5987164854351858e-07, "loss": 0.0001, "reward": 4.78125, "reward_std": 0.40715551376342773, "rewards/accuracy_reward": 3.481250047683716, "rewards/format_reward": 1.0, "step": 482, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 302.5, "epoch": 0.7396630934150077, "grad_norm": 6.164020418879224, "kl": 0.068359375, "learning_rate": 1.5811241735400793e-07, "loss": 0.0001, "reward": 3.34375, "reward_std": 0.49206188321113586, "rewards/accuracy_reward": 2.1187498569488525, "rewards/format_reward": 1.0, "step": 483, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 335.5, "epoch": 0.7411944869831547, "grad_norm": 9.137767557498421, "kl": 0.0634765625, "learning_rate": 1.5636109943246762e-07, "loss": 0.0001, "reward": 4.506250381469727, "reward_std": 0.4679912328720093, "rewards/accuracy_reward": 3.206249952316284, "rewards/format_reward": 1.0, "step": 484, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 338.28125, "epoch": 0.7427258805513017, "grad_norm": 6.2818164268747285, "kl": 0.06005859375, "learning_rate": 1.5461773531458455e-07, "loss": 0.0001, "reward": 3.362499952316284, "reward_std": 0.48934221267700195, "rewards/accuracy_reward": 2.0625, "rewards/format_reward": 1.0, "step": 485, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 275.0625, "epoch": 0.7442572741194488, "grad_norm": 16.37769015232479, "kl": 0.078125, "learning_rate": 1.5288236535194815e-07, "loss": 0.0001, "reward": 3.4437499046325684, "reward_std": 0.33781734108924866, "rewards/accuracy_reward": 2.143749952316284, "rewards/format_reward": 1.0, "step": 486, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 321.6875, "epoch": 0.7457886676875957, "grad_norm": 12.209190420784717, "kl": 0.061767578125, "learning_rate": 1.5115502971111733e-07, "loss": 0.0001, "reward": 3.8812501430511475, "reward_std": 0.5911651849746704, "rewards/accuracy_reward": 2.581249952316284, "rewards/format_reward": 1.0, "step": 487, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 283.90625, "epoch": 0.7473200612557427, "grad_norm": 9.204315985416914, "kl": 0.06640625, "learning_rate": 1.4943576837268896e-07, "loss": 0.0001, "reward": 4.5, "reward_std": 0.5566960573196411, "rewards/accuracy_reward": 3.2749998569488525, "rewards/format_reward": 1.0, "step": 488, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 338.875, "epoch": 0.7488514548238897, "grad_norm": 6.523657451114991, "kl": 0.0654296875, "learning_rate": 1.4772462113037431e-07, "loss": 0.0001, "reward": 4.318749904632568, "reward_std": 0.24353675544261932, "rewards/accuracy_reward": 3.018749952316284, "rewards/format_reward": 1.0, "step": 489, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 278.6875, "epoch": 0.7503828483920367, "grad_norm": 5.213619993552357, "kl": 0.06982421875, "learning_rate": 1.460216275900769e-07, "loss": 0.0001, "reward": 4.1875, "reward_std": 0.5140166282653809, "rewards/accuracy_reward": 2.887500047683716, "rewards/format_reward": 1.0, "step": 490, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 286.8125, "epoch": 0.7519142419601837, "grad_norm": 6.670540126650134, "kl": 0.0712890625, "learning_rate": 1.443268271689766e-07, "loss": 0.0001, "reward": 3.96875, "reward_std": 0.3987843990325928, "rewards/accuracy_reward": 2.6687498092651367, "rewards/format_reward": 1.0, "step": 491, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 289.875, "epoch": 0.7534456355283308, "grad_norm": 7.026601512632626, "kl": 0.076171875, "learning_rate": 1.426402590946163e-07, "loss": 0.0001, "reward": 3.625, "reward_std": 0.35273683071136475, "rewards/accuracy_reward": 2.325000047683716, "rewards/format_reward": 1.0, "step": 492, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 295.5, "epoch": 0.7549770290964778, "grad_norm": 7.503226154358219, "kl": 0.0634765625, "learning_rate": 1.4096196240399478e-07, "loss": 0.0001, "reward": 4.400000095367432, "reward_std": 0.39059334993362427, "rewards/accuracy_reward": 3.0999999046325684, "rewards/format_reward": 1.0, "step": 493, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 333.71875, "epoch": 0.7565084226646248, "grad_norm": 4.980978402620444, "kl": 0.078125, "learning_rate": 1.392919759426628e-07, "loss": 0.0001, "reward": 3.6062498092651367, "reward_std": 0.5807459354400635, "rewards/accuracy_reward": 2.3062500953674316, "rewards/format_reward": 1.0, "step": 494, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 352.5625, "epoch": 0.7580398162327718, "grad_norm": 7.798753483587729, "kl": 0.0693359375, "learning_rate": 1.3763033836382392e-07, "loss": 0.0001, "reward": 2.887500047683716, "reward_std": 0.23938804864883423, "rewards/accuracy_reward": 1.5875000953674316, "rewards/format_reward": 1.0, "step": 495, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 334.9375, "epoch": 0.7595712098009189, "grad_norm": 10.083985352107655, "kl": 0.06591796875, "learning_rate": 1.3597708812744034e-07, "loss": 0.0001, "reward": 3.4312498569488525, "reward_std": 0.23019403219223022, "rewards/accuracy_reward": 2.1312499046325684, "rewards/format_reward": 1.0, "step": 496, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 325.8125, "epoch": 0.7611026033690659, "grad_norm": 9.050885384906376, "kl": 0.0634765625, "learning_rate": 1.343322634993421e-07, "loss": 0.0001, "reward": 3.887500047683716, "reward_std": 0.32818514108657837, "rewards/accuracy_reward": 2.5875000953674316, "rewards/format_reward": 1.0, "step": 497, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.875, "epoch": 0.7626339969372129, "grad_norm": 5.378885028637031, "kl": 0.06298828125, "learning_rate": 1.3269590255034163e-07, "loss": 0.0001, "reward": 3.2937498092651367, "reward_std": 0.30960196256637573, "rewards/accuracy_reward": 1.993749976158142, "rewards/format_reward": 1.0, "step": 498, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 309.15625, "epoch": 0.7641653905053599, "grad_norm": 22.904067920632766, "kl": 0.058349609375, "learning_rate": 1.3106804315535264e-07, "loss": 0.0001, "reward": 4.3125, "reward_std": 0.5876265168190002, "rewards/accuracy_reward": 3.012500047683716, "rewards/format_reward": 1.0, "step": 499, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 340.78125, "epoch": 0.7656967840735069, "grad_norm": 9.043834005585135, "kl": 0.0751953125, "learning_rate": 1.294487229925132e-07, "loss": 0.0001, "reward": 2.7874999046325684, "reward_std": 0.36112481355667114, "rewards/accuracy_reward": 1.5625, "rewards/format_reward": 1.0, "step": 500, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 359.5, "epoch": 0.7672281776416539, "grad_norm": 5.84942052999165, "kl": 0.076171875, "learning_rate": 1.278379795423145e-07, "loss": 0.0001, "reward": 3.3374998569488525, "reward_std": 0.6417471766471863, "rewards/accuracy_reward": 2.0375001430511475, "rewards/format_reward": 1.0, "step": 501, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 315.59375, "epoch": 0.7687595712098009, "grad_norm": 8.322484842813823, "kl": 0.0791015625, "learning_rate": 1.262358500867318e-07, "loss": 0.0001, "reward": 3.875, "reward_std": 0.3749288320541382, "rewards/accuracy_reward": 2.575000047683716, "rewards/format_reward": 1.0, "step": 502, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 279.40625, "epoch": 0.7702909647779479, "grad_norm": 6.2770678767323105, "kl": 0.072265625, "learning_rate": 1.2464237170836313e-07, "loss": 0.0001, "reward": 3.7624998092651367, "reward_std": 0.2802865505218506, "rewards/accuracy_reward": 2.4625000953674316, "rewards/format_reward": 1.0, "step": 503, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 344.09375, "epoch": 0.7718223583460949, "grad_norm": 4.445773768777764, "kl": 0.0654296875, "learning_rate": 1.2305758128956973e-07, "loss": 0.0001, "reward": 3.5625, "reward_std": 0.491230309009552, "rewards/accuracy_reward": 2.2624998092651367, "rewards/format_reward": 1.0, "step": 504, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 325.53125, "epoch": 0.7733537519142419, "grad_norm": 4.920107986633787, "kl": 0.0654296875, "learning_rate": 1.2148151551162345e-07, "loss": 0.0001, "reward": 3.7249999046325684, "reward_std": 0.39738136529922485, "rewards/accuracy_reward": 2.424999952316284, "rewards/format_reward": 1.0, "step": 505, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 331.09375, "epoch": 0.774885145482389, "grad_norm": 6.548906578710591, "kl": 0.064453125, "learning_rate": 1.1991421085385672e-07, "loss": 0.0001, "reward": 4.168749809265137, "reward_std": 0.5465906262397766, "rewards/accuracy_reward": 2.8687500953674316, "rewards/format_reward": 1.0, "step": 506, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 298.6875, "epoch": 0.776416539050536, "grad_norm": 7.141153189502872, "kl": 0.0673828125, "learning_rate": 1.1835570359281893e-07, "loss": 0.0001, "reward": 3.6812500953674316, "reward_std": 0.43552249670028687, "rewards/accuracy_reward": 2.3812499046325684, "rewards/format_reward": 1.0, "step": 507, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 315.0625, "epoch": 0.777947932618683, "grad_norm": 6.2961568088398705, "kl": 0.06982421875, "learning_rate": 1.1680602980143639e-07, "loss": 0.0001, "reward": 4.149999618530273, "reward_std": 0.4107191562652588, "rewards/accuracy_reward": 2.8500001430511475, "rewards/format_reward": 1.0, "step": 508, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 306.5, "epoch": 0.77947932618683, "grad_norm": 5.438986938486748, "kl": 0.0634765625, "learning_rate": 1.152652253481774e-07, "loss": 0.0001, "reward": 3.78125, "reward_std": 0.5216482877731323, "rewards/accuracy_reward": 2.5562500953674316, "rewards/format_reward": 1.0, "step": 509, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 312.5625, "epoch": 0.781010719754977, "grad_norm": 8.038160550760889, "kl": 0.064453125, "learning_rate": 1.137333258962227e-07, "loss": 0.0001, "reward": 3.5, "reward_std": 0.43106070160865784, "rewards/accuracy_reward": 2.1999998092651367, "rewards/format_reward": 1.0, "step": 510, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 336.34375, "epoch": 0.7825421133231241, "grad_norm": 5.433265521527188, "kl": 0.0654296875, "learning_rate": 1.1221036690263885e-07, "loss": 0.0001, "reward": 3.9749999046325684, "reward_std": 0.25459665060043335, "rewards/accuracy_reward": 2.674999952316284, "rewards/format_reward": 1.0, "step": 511, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 340.25, "epoch": 0.7840735068912711, "grad_norm": 8.717682371268383, "kl": 0.064453125, "learning_rate": 1.1069638361755857e-07, "loss": 0.0001, "reward": 3.831249952316284, "reward_std": 0.4424981474876404, "rewards/accuracy_reward": 2.53125, "rewards/format_reward": 1.0, "step": 512, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 319.125, "epoch": 0.7856049004594181, "grad_norm": 14.452852305560832, "kl": 0.0673828125, "learning_rate": 1.0919141108336433e-07, "loss": 0.0001, "reward": 4.125, "reward_std": 0.6297014355659485, "rewards/accuracy_reward": 2.8249998092651367, "rewards/format_reward": 1.0, "step": 513, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 318.25, "epoch": 0.7871362940275651, "grad_norm": 8.46706644871554, "kl": 0.07177734375, "learning_rate": 1.0769548413387719e-07, "loss": 0.0001, "reward": 4.143750190734863, "reward_std": 0.3131358325481415, "rewards/accuracy_reward": 2.843750238418579, "rewards/format_reward": 1.0, "step": 514, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 320.6875, "epoch": 0.7886676875957122, "grad_norm": 6.559956586845839, "kl": 0.06787109375, "learning_rate": 1.0620863739355135e-07, "loss": 0.0001, "reward": 3.7562499046325684, "reward_std": 0.4320494532585144, "rewards/accuracy_reward": 2.456249952316284, "rewards/format_reward": 1.0, "step": 515, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 311.0, "epoch": 0.7901990811638591, "grad_norm": 20.319001292719538, "kl": 0.0712890625, "learning_rate": 1.0473090527667166e-07, "loss": 0.0001, "reward": 3.5812501907348633, "reward_std": 0.46948492527008057, "rewards/accuracy_reward": 2.28125, "rewards/format_reward": 1.0, "step": 516, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 351.1875, "epoch": 0.7917304747320061, "grad_norm": 11.42060678226176, "kl": 0.072265625, "learning_rate": 1.0326232198655738e-07, "loss": 0.0001, "reward": 3.5124998092651367, "reward_std": 0.30364906787872314, "rewards/accuracy_reward": 2.2124998569488525, "rewards/format_reward": 1.0, "step": 517, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 341.53125, "epoch": 0.7932618683001531, "grad_norm": 10.373305893594383, "kl": 0.0625, "learning_rate": 1.0180292151477099e-07, "loss": 0.0001, "reward": 3.6750001907348633, "reward_std": 0.5693778991699219, "rewards/accuracy_reward": 2.375, "rewards/format_reward": 1.0, "step": 518, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 372.0, "epoch": 0.7947932618683001, "grad_norm": 5.20394549436034, "kl": 0.060302734375, "learning_rate": 1.0035273764033131e-07, "loss": 0.0001, "reward": 3.9124996662139893, "reward_std": 0.3591833710670471, "rewards/accuracy_reward": 2.612499952316284, "rewards/format_reward": 1.0, "step": 519, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 350.375, "epoch": 0.7963246554364471, "grad_norm": 5.9918206526844955, "kl": 0.064453125, "learning_rate": 9.891180392893117e-08, "loss": 0.0001, "reward": 3.9749999046325684, "reward_std": 0.3856534957885742, "rewards/accuracy_reward": 2.674999952316284, "rewards/format_reward": 1.0, "step": 520, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 312.625, "epoch": 0.7978560490045942, "grad_norm": 3.749658955032504, "kl": 0.07177734375, "learning_rate": 9.748015373216078e-08, "loss": 0.0001, "reward": 4.337499618530273, "reward_std": 0.4496949315071106, "rewards/accuracy_reward": 3.0375001430511475, "rewards/format_reward": 1.0, "step": 521, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 353.625, "epoch": 0.7993874425727412, "grad_norm": 4.004240398257631, "kl": 0.0615234375, "learning_rate": 9.605782018673591e-08, "loss": 0.0001, "reward": 3.1937499046325684, "reward_std": 0.45035600662231445, "rewards/accuracy_reward": 1.8937500715255737, "rewards/format_reward": 1.0, "step": 522, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 340.8125, "epoch": 0.8009188361408882, "grad_norm": 4.644865902306151, "kl": 0.068359375, "learning_rate": 9.464483621373076e-08, "loss": 0.0001, "reward": 3.6187500953674316, "reward_std": 0.6933550834655762, "rewards/accuracy_reward": 2.3937501907348633, "rewards/format_reward": 1.0, "step": 523, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 316.75, "epoch": 0.8024502297090352, "grad_norm": 6.067119684325836, "kl": 0.06640625, "learning_rate": 9.324123451781618e-08, "loss": 0.0001, "reward": 3.78125, "reward_std": 0.3978561460971832, "rewards/accuracy_reward": 2.4812498092651367, "rewards/format_reward": 1.0, "step": 524, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 351.84375, "epoch": 0.8039816232771823, "grad_norm": 6.624482449249475, "kl": 0.064453125, "learning_rate": 9.184704758650241e-08, "loss": 0.0001, "reward": 3.606250047683716, "reward_std": 0.8364351987838745, "rewards/accuracy_reward": 2.3062500953674316, "rewards/format_reward": 1.0, "step": 525, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 303.375, "epoch": 0.8055130168453293, "grad_norm": 6.280530978489687, "kl": 0.0830078125, "learning_rate": 9.046230768938718e-08, "loss": 0.0001, "reward": 4.34375, "reward_std": 0.5486670136451721, "rewards/accuracy_reward": 3.043750047683716, "rewards/format_reward": 1.0, "step": 526, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 320.90625, "epoch": 0.8070444104134763, "grad_norm": 10.186624617343176, "kl": 0.07763671875, "learning_rate": 8.908704687740898e-08, "loss": 0.0001, "reward": 3.606250047683716, "reward_std": 0.20874956250190735, "rewards/accuracy_reward": 2.3062498569488525, "rewards/format_reward": 1.0, "step": 527, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 320.59375, "epoch": 0.8085758039816233, "grad_norm": 7.133477662889388, "kl": 0.072265625, "learning_rate": 8.772129698210495e-08, "loss": 0.0001, "reward": 3.5687499046325684, "reward_std": 0.3917831778526306, "rewards/accuracy_reward": 2.268749952316284, "rewards/format_reward": 1.0, "step": 528, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 299.6875, "epoch": 0.8101071975497703, "grad_norm": 5.393522043538054, "kl": 0.07568359375, "learning_rate": 8.636508961487471e-08, "loss": 0.0001, "reward": 4.518750190734863, "reward_std": 0.48873287439346313, "rewards/accuracy_reward": 3.21875, "rewards/format_reward": 1.0, "step": 529, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 345.46875, "epoch": 0.8116385911179173, "grad_norm": 6.620096667191019, "kl": 0.064453125, "learning_rate": 8.501845616624798e-08, "loss": 0.0001, "reward": 3.6937501430511475, "reward_std": 0.3262782394886017, "rewards/accuracy_reward": 2.393749952316284, "rewards/format_reward": 1.0, "step": 530, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 297.40625, "epoch": 0.8131699846860643, "grad_norm": 8.442037927760953, "kl": 0.06982421875, "learning_rate": 8.368142780515796e-08, "loss": 0.0001, "reward": 4.737500190734863, "reward_std": 0.48170897364616394, "rewards/accuracy_reward": 3.437499761581421, "rewards/format_reward": 1.0, "step": 531, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 340.4375, "epoch": 0.8147013782542113, "grad_norm": 9.915935858659468, "kl": 0.06689453125, "learning_rate": 8.235403547822062e-08, "loss": 0.0001, "reward": 3.0187501907348633, "reward_std": 0.5618056058883667, "rewards/accuracy_reward": 1.7937500476837158, "rewards/format_reward": 1.0, "step": 532, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 312.84375, "epoch": 0.8162327718223583, "grad_norm": 5.261205124298314, "kl": 0.07421875, "learning_rate": 8.103630990901827e-08, "loss": 0.0001, "reward": 3.0374999046325684, "reward_std": 0.3254516124725342, "rewards/accuracy_reward": 1.7375001907348633, "rewards/format_reward": 1.0, "step": 533, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 324.5625, "epoch": 0.8177641653905053, "grad_norm": 8.964171987309859, "kl": 0.07421875, "learning_rate": 7.972828159738765e-08, "loss": 0.0001, "reward": 3.4124999046325684, "reward_std": 0.4482683539390564, "rewards/accuracy_reward": 2.1125001907348633, "rewards/format_reward": 1.0, "step": 534, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 367.5625, "epoch": 0.8192955589586524, "grad_norm": 9.20349897028433, "kl": 0.06982421875, "learning_rate": 7.842998081871493e-08, "loss": 0.0001, "reward": 3.0625, "reward_std": 0.35382628440856934, "rewards/accuracy_reward": 1.7625000476837158, "rewards/format_reward": 1.0, "step": 535, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 368.09375, "epoch": 0.8208269525267994, "grad_norm": 7.456768649523639, "kl": 0.06201171875, "learning_rate": 7.714143762323433e-08, "loss": 0.0001, "reward": 4.1875, "reward_std": 0.39741051197052, "rewards/accuracy_reward": 2.8874998092651367, "rewards/format_reward": 1.0, "step": 536, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 340.03125, "epoch": 0.8223583460949464, "grad_norm": 5.3780765218567845, "kl": 0.064453125, "learning_rate": 7.58626818353329e-08, "loss": 0.0001, "reward": 3.8187499046325684, "reward_std": 0.39980944991111755, "rewards/accuracy_reward": 2.518749952316284, "rewards/format_reward": 1.0, "step": 537, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 382.625, "epoch": 0.8238897396630934, "grad_norm": 4.210494443133476, "kl": 0.064453125, "learning_rate": 7.459374305286009e-08, "loss": 0.0001, "reward": 4.081250190734863, "reward_std": 0.30467599630355835, "rewards/accuracy_reward": 2.78125, "rewards/format_reward": 1.0, "step": 538, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 382.3125, "epoch": 0.8254211332312404, "grad_norm": 6.8239106890424885, "kl": 0.0634765625, "learning_rate": 7.333465064644301e-08, "loss": 0.0001, "reward": 3.237499952316284, "reward_std": 0.43901118636131287, "rewards/accuracy_reward": 1.9375, "rewards/format_reward": 1.0, "step": 539, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 300.5625, "epoch": 0.8269525267993875, "grad_norm": 8.104808281105164, "kl": 0.07763671875, "learning_rate": 7.208543375880594e-08, "loss": 0.0001, "reward": 3.2437500953674316, "reward_std": 0.44948697090148926, "rewards/accuracy_reward": 1.9437499046325684, "rewards/format_reward": 1.0, "step": 540, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 361.0625, "epoch": 0.8284839203675345, "grad_norm": 6.257706588887124, "kl": 0.06591796875, "learning_rate": 7.084612130409634e-08, "loss": 0.0001, "reward": 3.9125001430511475, "reward_std": 0.39685332775115967, "rewards/accuracy_reward": 2.612499952316284, "rewards/format_reward": 1.0, "step": 541, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 333.9375, "epoch": 0.8300153139356815, "grad_norm": 7.447523308777689, "kl": 0.0654296875, "learning_rate": 6.961674196721556e-08, "loss": 0.0001, "reward": 3.674999713897705, "reward_std": 0.4210602045059204, "rewards/accuracy_reward": 2.375, "rewards/format_reward": 1.0, "step": 542, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 349.21875, "epoch": 0.8315467075038285, "grad_norm": 5.213659250225261, "kl": 0.06982421875, "learning_rate": 6.839732420315458e-08, "loss": 0.0001, "reward": 3.387500047683716, "reward_std": 0.20724307000637054, "rewards/accuracy_reward": 2.0875000953674316, "rewards/format_reward": 1.0, "step": 543, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 311.53125, "epoch": 0.8330781010719756, "grad_norm": 6.187298433070561, "kl": 0.07373046875, "learning_rate": 6.718789623633597e-08, "loss": 0.0001, "reward": 4.181249618530273, "reward_std": 0.3611743152141571, "rewards/accuracy_reward": 2.8812499046325684, "rewards/format_reward": 1.0, "step": 544, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 350.90625, "epoch": 0.8346094946401225, "grad_norm": 6.411269307309986, "kl": 0.06884765625, "learning_rate": 6.598848605996004e-08, "loss": 0.0001, "reward": 3.737499952316284, "reward_std": 0.5504343509674072, "rewards/accuracy_reward": 2.512500047683716, "rewards/format_reward": 1.0, "step": 545, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 388.3125, "epoch": 0.8361408882082695, "grad_norm": 4.73243763405586, "kl": 0.0654296875, "learning_rate": 6.479912143535699e-08, "loss": 0.0001, "reward": 3.0999999046325684, "reward_std": 0.25826239585876465, "rewards/accuracy_reward": 1.875, "rewards/format_reward": 1.0, "step": 546, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 280.78125, "epoch": 0.8376722817764165, "grad_norm": 5.847861898680038, "kl": 0.08203125, "learning_rate": 6.361982989134468e-08, "loss": 0.0001, "reward": 3.8187501430511475, "reward_std": 0.48482370376586914, "rewards/accuracy_reward": 2.5187501907348633, "rewards/format_reward": 1.0, "step": 547, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 339.15625, "epoch": 0.8392036753445635, "grad_norm": 8.739221622928392, "kl": 0.0732421875, "learning_rate": 6.245063872359141e-08, "loss": 0.0001, "reward": 4.131249904632568, "reward_std": 0.402154803276062, "rewards/accuracy_reward": 2.831249952316284, "rewards/format_reward": 1.0, "step": 548, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 361.96875, "epoch": 0.8407350689127105, "grad_norm": 6.410308165004863, "kl": 0.06591796875, "learning_rate": 6.129157499398385e-08, "loss": 0.0001, "reward": 3.6624999046325684, "reward_std": 0.28865599632263184, "rewards/accuracy_reward": 2.3625001907348633, "rewards/format_reward": 1.0, "step": 549, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 312.21875, "epoch": 0.8422664624808576, "grad_norm": 6.139748962368488, "kl": 0.0751953125, "learning_rate": 6.014266553000074e-08, "loss": 0.0001, "reward": 3.9125001430511475, "reward_std": 0.3542941212654114, "rewards/accuracy_reward": 2.6875, "rewards/format_reward": 1.0, "step": 550, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 341.375, "epoch": 0.8437978560490046, "grad_norm": 14.756876501209561, "kl": 0.06884765625, "learning_rate": 5.900393692409222e-08, "loss": 0.0001, "reward": 3.206249952316284, "reward_std": 0.3955453038215637, "rewards/accuracy_reward": 1.9812500476837158, "rewards/format_reward": 1.0, "step": 551, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 304.34375, "epoch": 0.8453292496171516, "grad_norm": 8.973776709804563, "kl": 0.07861328125, "learning_rate": 5.787541553306385e-08, "loss": 0.0001, "reward": 3.862499952316284, "reward_std": 0.4274066686630249, "rewards/accuracy_reward": 2.5625, "rewards/format_reward": 1.0, "step": 552, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 365.25, "epoch": 0.8468606431852986, "grad_norm": 8.468511967683463, "kl": 0.07080078125, "learning_rate": 5.6757127477467305e-08, "loss": 0.0001, "reward": 3.7624998092651367, "reward_std": 0.37143707275390625, "rewards/accuracy_reward": 2.4625000953674316, "rewards/format_reward": 1.0, "step": 553, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 296.625, "epoch": 0.8483920367534457, "grad_norm": 6.893679444223453, "kl": 0.07177734375, "learning_rate": 5.564909864099493e-08, "loss": 0.0001, "reward": 3.15625, "reward_std": 0.6041836738586426, "rewards/accuracy_reward": 1.8562499284744263, "rewards/format_reward": 1.0, "step": 554, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 304.4375, "epoch": 0.8499234303215927, "grad_norm": 6.661123517820795, "kl": 0.072265625, "learning_rate": 5.4551354669881145e-08, "loss": 0.0001, "reward": 4.0625, "reward_std": 0.24634216725826263, "rewards/accuracy_reward": 2.7624998092651367, "rewards/format_reward": 1.0, "step": 555, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 323.4375, "epoch": 0.8514548238897397, "grad_norm": 8.99928636834237, "kl": 0.07470703125, "learning_rate": 5.34639209723089e-08, "loss": 0.0001, "reward": 3.6999998092651367, "reward_std": 0.35582679510116577, "rewards/accuracy_reward": 2.4000000953674316, "rewards/format_reward": 1.0, "step": 556, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 315.84375, "epoch": 0.8529862174578867, "grad_norm": 15.728098381965413, "kl": 0.0751953125, "learning_rate": 5.238682271782102e-08, "loss": 0.0001, "reward": 3.8812499046325684, "reward_std": 0.5641553401947021, "rewards/accuracy_reward": 2.5812501907348633, "rewards/format_reward": 1.0, "step": 557, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 317.5625, "epoch": 0.8545176110260337, "grad_norm": 9.747780740390152, "kl": 0.0673828125, "learning_rate": 5.132008483673872e-08, "loss": 0.0001, "reward": 4.112500190734863, "reward_std": 0.25696486234664917, "rewards/accuracy_reward": 2.8125, "rewards/format_reward": 1.0, "step": 558, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 330.03125, "epoch": 0.8560490045941807, "grad_norm": 58.7445907158911, "kl": 0.068359375, "learning_rate": 5.0263732019583335e-08, "loss": 0.0001, "reward": 3.3499999046325684, "reward_std": 0.2300891876220703, "rewards/accuracy_reward": 2.049999952316284, "rewards/format_reward": 1.0, "step": 559, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 341.1875, "epoch": 0.8575803981623277, "grad_norm": 15.018191525656299, "kl": 0.068359375, "learning_rate": 4.921778871650539e-08, "loss": 0.0001, "reward": 3.581249952316284, "reward_std": 0.33813318610191345, "rewards/accuracy_reward": 2.28125, "rewards/format_reward": 1.0, "step": 560, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 339.40625, "epoch": 0.8591117917304747, "grad_norm": 10.084138368008286, "kl": 0.0654296875, "learning_rate": 4.818227913671891e-08, "loss": 0.0001, "reward": 3.268749952316284, "reward_std": 0.536249041557312, "rewards/accuracy_reward": 2.0437498092651367, "rewards/format_reward": 1.0, "step": 561, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 310.3125, "epoch": 0.8606431852986217, "grad_norm": 6.380660496138349, "kl": 0.0693359375, "learning_rate": 4.715722724794091e-08, "loss": 0.0001, "reward": 3.9312500953674316, "reward_std": 0.5714499950408936, "rewards/accuracy_reward": 2.6312499046325684, "rewards/format_reward": 1.0, "step": 562, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 325.5, "epoch": 0.8621745788667687, "grad_norm": 4.148426286122439, "kl": 0.0712890625, "learning_rate": 4.6142656775836395e-08, "loss": 0.0001, "reward": 3.7437496185302734, "reward_std": 0.416331022977829, "rewards/accuracy_reward": 2.4437499046325684, "rewards/format_reward": 1.0, "step": 563, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 370.3125, "epoch": 0.8637059724349158, "grad_norm": 8.631894184924093, "kl": 0.06201171875, "learning_rate": 4.513859120346947e-08, "loss": 0.0001, "reward": 3.549999952316284, "reward_std": 0.337804913520813, "rewards/accuracy_reward": 2.25, "rewards/format_reward": 1.0, "step": 564, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 342.1875, "epoch": 0.8652373660030628, "grad_norm": 4.359189319403748, "kl": 0.06103515625, "learning_rate": 4.414505377075978e-08, "loss": 0.0001, "reward": 4.15625, "reward_std": 0.41671764850616455, "rewards/accuracy_reward": 2.8562498092651367, "rewards/format_reward": 1.0, "step": 565, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 349.25, "epoch": 0.8667687595712098, "grad_norm": 7.361451262732712, "kl": 0.0673828125, "learning_rate": 4.316206747394435e-08, "loss": 0.0001, "reward": 3.799999952316284, "reward_std": 0.4484034776687622, "rewards/accuracy_reward": 2.5, "rewards/format_reward": 1.0, "step": 566, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 276.25, "epoch": 0.8683001531393568, "grad_norm": 3.973315355124972, "kl": 0.072265625, "learning_rate": 4.218965506504596e-08, "loss": 0.0001, "reward": 4.175000190734863, "reward_std": 0.519250750541687, "rewards/accuracy_reward": 2.950000047683716, "rewards/format_reward": 1.0, "step": 567, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 295.03125, "epoch": 0.8698315467075038, "grad_norm": 5.998362168235618, "kl": 0.0791015625, "learning_rate": 4.122783905134564e-08, "loss": 0.0001, "reward": 4.337499618530273, "reward_std": 0.4173644781112671, "rewards/accuracy_reward": 3.0375001430511475, "rewards/format_reward": 1.0, "step": 568, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 315.0, "epoch": 0.8713629402756509, "grad_norm": 8.798421112404203, "kl": 0.07177734375, "learning_rate": 4.0276641694862504e-08, "loss": 0.0001, "reward": 4.037499904632568, "reward_std": 0.34528836607933044, "rewards/accuracy_reward": 2.737499713897705, "rewards/format_reward": 1.0, "step": 569, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 265.4375, "epoch": 0.8728943338437979, "grad_norm": 3.2524813095600424, "kl": 0.072265625, "learning_rate": 3.933608501183788e-08, "loss": 0.0001, "reward": 4.018749713897705, "reward_std": 0.3298349678516388, "rewards/accuracy_reward": 2.71875, "rewards/format_reward": 1.0, "step": 570, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 356.5, "epoch": 0.8744257274119449, "grad_norm": 5.812136800776212, "kl": 0.061279296875, "learning_rate": 3.840619077222612e-08, "loss": 0.0001, "reward": 3.7937498092651367, "reward_std": 0.2996395230293274, "rewards/accuracy_reward": 2.4937498569488525, "rewards/format_reward": 1.0, "step": 571, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 305.5625, "epoch": 0.8759571209800919, "grad_norm": 15.570383714813701, "kl": 0.078125, "learning_rate": 3.7486980499190804e-08, "loss": 0.0001, "reward": 3.3625001907348633, "reward_std": 0.2761770784854889, "rewards/accuracy_reward": 2.137500047683716, "rewards/format_reward": 1.0, "step": 572, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 281.65625, "epoch": 0.877488514548239, "grad_norm": 7.2942510199839194, "kl": 0.07421875, "learning_rate": 3.6578475468606096e-08, "loss": 0.0001, "reward": 4.09375, "reward_std": 0.38450920581817627, "rewards/accuracy_reward": 2.7937498092651367, "rewards/format_reward": 1.0, "step": 573, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 305.09375, "epoch": 0.8790199081163859, "grad_norm": 15.251141316249088, "kl": 0.0791015625, "learning_rate": 3.568069670856466e-08, "loss": 0.0001, "reward": 3.7562499046325684, "reward_std": 0.36518940329551697, "rewards/accuracy_reward": 2.53125, "rewards/format_reward": 1.0, "step": 574, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 330.59375, "epoch": 0.8805513016845329, "grad_norm": 9.60945967657897, "kl": 0.0712890625, "learning_rate": 3.479366499889058e-08, "loss": 0.0001, "reward": 4.612500190734863, "reward_std": 0.26537150144577026, "rewards/accuracy_reward": 3.3125, "rewards/format_reward": 1.0, "step": 575, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 410.78125, "epoch": 0.8820826952526799, "grad_norm": 4.801673821010206, "kl": 0.06201171875, "learning_rate": 3.391740087065914e-08, "loss": 0.0001, "reward": 3.1500000953674316, "reward_std": 0.13879363238811493, "rewards/accuracy_reward": 1.9249999523162842, "rewards/format_reward": 1.0, "step": 576, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 387.40625, "epoch": 0.8836140888208269, "grad_norm": 38.338735179774616, "kl": 0.0693359375, "learning_rate": 3.305192460572087e-08, "loss": 0.0001, "reward": 3.674999952316284, "reward_std": 0.4538288116455078, "rewards/accuracy_reward": 2.375, "rewards/format_reward": 1.0, "step": 577, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 356.46875, "epoch": 0.885145482388974, "grad_norm": 18.580046936316474, "kl": 0.06298828125, "learning_rate": 3.219725623623243e-08, "loss": 0.0001, "reward": 3.1187500953674316, "reward_std": 0.3937293291091919, "rewards/accuracy_reward": 1.8937499523162842, "rewards/format_reward": 1.0, "step": 578, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 389.0625, "epoch": 0.886676875957121, "grad_norm": 6.789398671780015, "kl": 0.0625, "learning_rate": 3.135341554419274e-08, "loss": 0.0001, "reward": 3.5999999046325684, "reward_std": 0.4359382390975952, "rewards/accuracy_reward": 2.3000001907348633, "rewards/format_reward": 1.0, "step": 579, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 323.9375, "epoch": 0.888208269525268, "grad_norm": 5.950185827212625, "kl": 0.0673828125, "learning_rate": 3.052042206098537e-08, "loss": 0.0001, "reward": 3.75, "reward_std": 0.4836667478084564, "rewards/accuracy_reward": 2.450000047683716, "rewards/format_reward": 1.0, "step": 580, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 299.34375, "epoch": 0.889739663093415, "grad_norm": 4.44586827014121, "kl": 0.07275390625, "learning_rate": 2.9698295066926615e-08, "loss": 0.0001, "reward": 3.4250001907348633, "reward_std": 0.4025263786315918, "rewards/accuracy_reward": 2.125, "rewards/format_reward": 1.0, "step": 581, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 333.3125, "epoch": 0.891271056661562, "grad_norm": 9.428133854879954, "kl": 0.07177734375, "learning_rate": 2.8887053590818556e-08, "loss": 0.0001, "reward": 4.275000095367432, "reward_std": 0.28471794724464417, "rewards/accuracy_reward": 2.9750001430511475, "rewards/format_reward": 1.0, "step": 582, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 337.3125, "epoch": 0.892802450229709, "grad_norm": 6.226688787620919, "kl": 0.056884765625, "learning_rate": 2.808671640950927e-08, "loss": 0.0001, "reward": 3.7249999046325684, "reward_std": 0.4804357588291168, "rewards/accuracy_reward": 2.424999952316284, "rewards/format_reward": 1.0, "step": 583, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 372.3125, "epoch": 0.8943338437978561, "grad_norm": 7.851692063224346, "kl": 0.0693359375, "learning_rate": 2.7297302047458058e-08, "loss": 0.0001, "reward": 3.987499952316284, "reward_std": 0.42765170335769653, "rewards/accuracy_reward": 2.6875, "rewards/format_reward": 1.0, "step": 584, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 331.3125, "epoch": 0.8958652373660031, "grad_norm": 5.870969665593799, "kl": 0.060302734375, "learning_rate": 2.6518828776306347e-08, "loss": 0.0001, "reward": 4.237500190734863, "reward_std": 0.3059806525707245, "rewards/accuracy_reward": 2.9375, "rewards/format_reward": 1.0, "step": 585, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 297.40625, "epoch": 0.8973966309341501, "grad_norm": 4.616698971444567, "kl": 0.07958984375, "learning_rate": 2.5751314614455455e-08, "loss": 0.0001, "reward": 4.0625, "reward_std": 0.3738013207912445, "rewards/accuracy_reward": 2.762500047683716, "rewards/format_reward": 1.0, "step": 586, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 307.0, "epoch": 0.8989280245022971, "grad_norm": 4.757707203191272, "kl": 0.072265625, "learning_rate": 2.4994777326648954e-08, "loss": 0.0001, "reward": 4.106249809265137, "reward_std": 0.3416978120803833, "rewards/accuracy_reward": 2.8062500953674316, "rewards/format_reward": 1.0, "step": 587, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 309.8125, "epoch": 0.900459418070444, "grad_norm": 4.265384443593157, "kl": 0.0693359375, "learning_rate": 2.424923442356158e-08, "loss": 0.0001, "reward": 4.674999713897705, "reward_std": 0.3187902867794037, "rewards/accuracy_reward": 3.375000238418579, "rewards/format_reward": 1.0, "step": 588, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 351.15625, "epoch": 0.9019908116385911, "grad_norm": 3.849769706047688, "kl": 0.06494140625, "learning_rate": 2.3514703161394088e-08, "loss": 0.0001, "reward": 3.3562498092651367, "reward_std": 0.5659125447273254, "rewards/accuracy_reward": 2.0562500953674316, "rewards/format_reward": 1.0, "step": 589, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 346.75, "epoch": 0.9035222052067381, "grad_norm": 3.9256866651208986, "kl": 0.06787109375, "learning_rate": 2.279120054147393e-08, "loss": 0.0001, "reward": 3.706249952316284, "reward_std": 0.5209039449691772, "rewards/accuracy_reward": 2.40625, "rewards/format_reward": 1.0, "step": 590, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 371.53125, "epoch": 0.9050535987748851, "grad_norm": 5.095664340886334, "kl": 0.06591796875, "learning_rate": 2.207874330986148e-08, "loss": 0.0001, "reward": 3.4499998092651367, "reward_std": 0.2862982749938965, "rewards/accuracy_reward": 2.1500000953674316, "rewards/format_reward": 1.0, "step": 591, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 327.03125, "epoch": 0.9065849923430321, "grad_norm": 7.617297771827687, "kl": 0.07666015625, "learning_rate": 2.1377347956962556e-08, "loss": 0.0001, "reward": 3.706249952316284, "reward_std": 0.3007799983024597, "rewards/accuracy_reward": 2.406249761581421, "rewards/format_reward": 1.0, "step": 592, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 353.71875, "epoch": 0.9081163859111792, "grad_norm": 5.52079112647666, "kl": 0.06640625, "learning_rate": 2.068703071714678e-08, "loss": 0.0001, "reward": 3.34375, "reward_std": 0.3307170867919922, "rewards/accuracy_reward": 2.043750047683716, "rewards/format_reward": 1.0, "step": 593, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 353.40625, "epoch": 0.9096477794793262, "grad_norm": 8.716888115770072, "kl": 0.05810546875, "learning_rate": 2.0007807568371725e-08, "loss": 0.0001, "reward": 4.125, "reward_std": 0.3458287715911865, "rewards/accuracy_reward": 2.8249998092651367, "rewards/format_reward": 1.0, "step": 594, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 312.78125, "epoch": 0.9111791730474732, "grad_norm": 5.199539573733889, "kl": 0.06884765625, "learning_rate": 1.9339694231813252e-08, "loss": 0.0001, "reward": 3.3687500953674316, "reward_std": 0.4148343503475189, "rewards/accuracy_reward": 2.143749952316284, "rewards/format_reward": 1.0, "step": 595, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 344.125, "epoch": 0.9127105666156202, "grad_norm": 7.552839202353648, "kl": 0.0625, "learning_rate": 1.8682706171501416e-08, "loss": 0.0001, "reward": 3.799999952316284, "reward_std": 0.40122342109680176, "rewards/accuracy_reward": 2.5, "rewards/format_reward": 1.0, "step": 596, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 294.84375, "epoch": 0.9142419601837672, "grad_norm": 9.787298112854378, "kl": 0.06982421875, "learning_rate": 1.80368585939627e-08, "loss": 0.0001, "reward": 3.4250001907348633, "reward_std": 0.4172694683074951, "rewards/accuracy_reward": 2.200000047683716, "rewards/format_reward": 1.0, "step": 597, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 344.125, "epoch": 0.9157733537519143, "grad_norm": 28.605894274862234, "kl": 0.0703125, "learning_rate": 1.7402166447867962e-08, "loss": 0.0001, "reward": 3.5062499046325684, "reward_std": 0.4138874411582947, "rewards/accuracy_reward": 2.206249952316284, "rewards/format_reward": 1.0, "step": 598, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 359.40625, "epoch": 0.9173047473200613, "grad_norm": 4.883960139516868, "kl": 0.06298828125, "learning_rate": 1.6778644423686482e-08, "loss": 0.0001, "reward": 3.4124999046325684, "reward_std": 0.4562169313430786, "rewards/accuracy_reward": 2.112499952316284, "rewards/format_reward": 1.0, "step": 599, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 337.1875, "epoch": 0.9188361408882083, "grad_norm": 10.33792433059585, "kl": 0.0625, "learning_rate": 1.616630695334592e-08, "loss": 0.0001, "reward": 4.493749618530273, "reward_std": 0.5656530857086182, "rewards/accuracy_reward": 3.1937499046325684, "rewards/format_reward": 1.0, "step": 600, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 307.28125, "epoch": 0.9203675344563553, "grad_norm": 13.576771230474968, "kl": 0.0751953125, "learning_rate": 1.5565168209898395e-08, "loss": 0.0001, "reward": 3.5250000953674316, "reward_std": 0.2717602252960205, "rewards/accuracy_reward": 2.299999952316284, "rewards/format_reward": 1.0, "step": 601, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 364.8125, "epoch": 0.9218989280245024, "grad_norm": 10.322156519733802, "kl": 0.06640625, "learning_rate": 1.497524210719203e-08, "loss": 0.0001, "reward": 3.8999998569488525, "reward_std": 0.35277166962623596, "rewards/accuracy_reward": 2.6000001430511475, "rewards/format_reward": 1.0, "step": 602, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 343.125, "epoch": 0.9234303215926493, "grad_norm": 4.515624838691467, "kl": 0.06884765625, "learning_rate": 1.4396542299549563e-08, "loss": 0.0001, "reward": 3.6500000953674316, "reward_std": 0.3364337384700775, "rewards/accuracy_reward": 2.3499999046325684, "rewards/format_reward": 1.0, "step": 603, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 328.6875, "epoch": 0.9249617151607963, "grad_norm": 23.351233380824027, "kl": 0.0654296875, "learning_rate": 1.3829082181451624e-08, "loss": 0.0001, "reward": 4.300000190734863, "reward_std": 0.29585695266723633, "rewards/accuracy_reward": 3.0, "rewards/format_reward": 1.0, "step": 604, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 333.4375, "epoch": 0.9264931087289433, "grad_norm": 10.101075291421978, "kl": 0.068359375, "learning_rate": 1.3272874887227281e-08, "loss": 0.0001, "reward": 3.2125000953674316, "reward_std": 0.44710811972618103, "rewards/accuracy_reward": 1.912500023841858, "rewards/format_reward": 1.0, "step": 605, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 350.1875, "epoch": 0.9280245022970903, "grad_norm": 5.202766658176163, "kl": 0.06494140625, "learning_rate": 1.2727933290749615e-08, "loss": 0.0001, "reward": 4.256249904632568, "reward_std": 0.2566061019897461, "rewards/accuracy_reward": 3.03125, "rewards/format_reward": 1.0, "step": 606, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 319.78125, "epoch": 0.9295558958652373, "grad_norm": 6.1359910508190385, "kl": 0.0703125, "learning_rate": 1.2194270005137953e-08, "loss": 0.0001, "reward": 3.3999998569488525, "reward_std": 0.26921939849853516, "rewards/accuracy_reward": 2.174999713897705, "rewards/format_reward": 1.0, "step": 607, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 316.46875, "epoch": 0.9310872894333844, "grad_norm": 6.319370371879856, "kl": 0.06640625, "learning_rate": 1.1671897382465878e-08, "loss": 0.0001, "reward": 3.8249998092651367, "reward_std": 0.4702260196208954, "rewards/accuracy_reward": 2.5250000953674316, "rewards/format_reward": 1.0, "step": 608, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 301.4375, "epoch": 0.9326186830015314, "grad_norm": 5.090879708247038, "kl": 0.06884765625, "learning_rate": 1.1160827513475468e-08, "loss": 0.0001, "reward": 3.90625, "reward_std": 0.4447071850299835, "rewards/accuracy_reward": 2.606250047683716, "rewards/format_reward": 1.0, "step": 609, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 306.21875, "epoch": 0.9341500765696784, "grad_norm": 5.687037019839013, "kl": 0.0712890625, "learning_rate": 1.066107222729712e-08, "loss": 0.0001, "reward": 3.625, "reward_std": 0.3328478932380676, "rewards/accuracy_reward": 2.3249998092651367, "rewards/format_reward": 1.0, "step": 610, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 303.90625, "epoch": 0.9356814701378254, "grad_norm": 6.2524293242086815, "kl": 0.07421875, "learning_rate": 1.0172643091176104e-08, "loss": 0.0001, "reward": 3.4124999046325684, "reward_std": 0.4472746253013611, "rewards/accuracy_reward": 2.112499952316284, "rewards/format_reward": 1.0, "step": 611, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 319.21875, "epoch": 0.9372128637059725, "grad_norm": 16.203473126784306, "kl": 0.07080078125, "learning_rate": 9.695551410204506e-09, "loss": 0.0001, "reward": 3.8125, "reward_std": 0.5833103060722351, "rewards/accuracy_reward": 2.512500047683716, "rewards/format_reward": 1.0, "step": 612, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 332.09375, "epoch": 0.9387442572741195, "grad_norm": 4.888634913199968, "kl": 0.0751953125, "learning_rate": 9.229808227059876e-09, "loss": 0.0001, "reward": 3.4124999046325684, "reward_std": 0.289096474647522, "rewards/accuracy_reward": 2.1125001907348633, "rewards/format_reward": 1.0, "step": 613, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.34375, "epoch": 0.9402756508422665, "grad_norm": 7.009140647685338, "kl": 0.076171875, "learning_rate": 8.775424321749381e-09, "loss": 0.0001, "reward": 2.9437499046325684, "reward_std": 0.25198203325271606, "rewards/accuracy_reward": 1.6437499523162842, "rewards/format_reward": 1.0, "step": 614, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 361.59375, "epoch": 0.9418070444104135, "grad_norm": 5.095209828668755, "kl": 0.06689453125, "learning_rate": 8.332410211360608e-09, "loss": 0.0001, "reward": 3.53125, "reward_std": 0.2518140375614166, "rewards/accuracy_reward": 2.2312498092651367, "rewards/format_reward": 1.0, "step": 615, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 337.09375, "epoch": 0.9433384379785605, "grad_norm": 5.274059040607986, "kl": 0.0703125, "learning_rate": 7.900776149817712e-09, "loss": 0.0001, "reward": 4.0, "reward_std": 0.5040473937988281, "rewards/accuracy_reward": 2.6999998092651367, "rewards/format_reward": 1.0, "step": 616, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 327.59375, "epoch": 0.9448698315467075, "grad_norm": 6.827469118684358, "kl": 0.06640625, "learning_rate": 7.480532127644435e-09, "loss": 0.0001, "reward": 4.274999618530273, "reward_std": 0.5195462703704834, "rewards/accuracy_reward": 3.049999952316284, "rewards/format_reward": 1.0, "step": 617, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 333.40625, "epoch": 0.9464012251148545, "grad_norm": 8.68679701533234, "kl": 0.0673828125, "learning_rate": 7.071687871732512e-09, "loss": 0.0001, "reward": 3.424999713897705, "reward_std": 0.2530859112739563, "rewards/accuracy_reward": 2.125, "rewards/format_reward": 1.0, "step": 618, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 354.8125, "epoch": 0.9479326186830015, "grad_norm": 6.14507875298798, "kl": 0.0673828125, "learning_rate": 6.6742528451171895e-09, "loss": 0.0001, "reward": 2.8312501907348633, "reward_std": 0.34376227855682373, "rewards/accuracy_reward": 1.6062500476837158, "rewards/format_reward": 1.0, "step": 619, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 320.25, "epoch": 0.9494640122511485, "grad_norm": 5.882825033336854, "kl": 0.068359375, "learning_rate": 6.288236246757284e-09, "loss": 0.0001, "reward": 4.118749618530273, "reward_std": 0.3603181540966034, "rewards/accuracy_reward": 2.8187499046325684, "rewards/format_reward": 1.0, "step": 620, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 345.71875, "epoch": 0.9509954058192955, "grad_norm": 15.156974160837702, "kl": 0.064453125, "learning_rate": 5.913647011323075e-09, "loss": 0.0001, "reward": 3.174999713897705, "reward_std": 0.2598288655281067, "rewards/accuracy_reward": 1.875, "rewards/format_reward": 1.0, "step": 621, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 297.40625, "epoch": 0.9525267993874426, "grad_norm": 3.840998936877307, "kl": 0.07275390625, "learning_rate": 5.5504938089890316e-09, "loss": 0.0001, "reward": 3.799999952316284, "reward_std": 0.34373825788497925, "rewards/accuracy_reward": 2.499999761581421, "rewards/format_reward": 1.0, "step": 622, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 347.78125, "epoch": 0.9540581929555896, "grad_norm": 7.652011405321351, "kl": 0.07177734375, "learning_rate": 5.198785045233245e-09, "loss": 0.0001, "reward": 3.4124999046325684, "reward_std": 0.43421119451522827, "rewards/accuracy_reward": 2.112499952316284, "rewards/format_reward": 1.0, "step": 623, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 310.53125, "epoch": 0.9555895865237366, "grad_norm": 4.331709203711526, "kl": 0.0751953125, "learning_rate": 4.85852886064303e-09, "loss": 0.0001, "reward": 3.6875, "reward_std": 0.3671942353248596, "rewards/accuracy_reward": 2.4625000953674316, "rewards/format_reward": 1.0, "step": 624, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 355.71875, "epoch": 0.9571209800918836, "grad_norm": 4.783970405387704, "kl": 0.07080078125, "learning_rate": 4.529733130726299e-09, "loss": 0.0001, "reward": 3.2562499046325684, "reward_std": 0.37546029686927795, "rewards/accuracy_reward": 1.9562499523162842, "rewards/format_reward": 1.0, "step": 625, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 361.0, "epoch": 0.9586523736600306, "grad_norm": 11.829435512309791, "kl": 0.0791015625, "learning_rate": 4.2124054657293184e-09, "loss": 0.0001, "reward": 3.4749999046325684, "reward_std": 0.5535058975219727, "rewards/accuracy_reward": 2.25, "rewards/format_reward": 1.0, "step": 626, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 306.9375, "epoch": 0.9601837672281777, "grad_norm": 10.714822722401369, "kl": 0.07861328125, "learning_rate": 3.9065532104607946e-09, "loss": 0.0001, "reward": 4.012499809265137, "reward_std": 0.4494459629058838, "rewards/accuracy_reward": 2.7125000953674316, "rewards/format_reward": 1.0, "step": 627, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 384.875, "epoch": 0.9617151607963247, "grad_norm": 7.244466933322755, "kl": 0.064453125, "learning_rate": 3.6121834441213416e-09, "loss": 0.0001, "reward": 3.8125, "reward_std": 0.40354323387145996, "rewards/accuracy_reward": 2.512500047683716, "rewards/format_reward": 1.0, "step": 628, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 311.96875, "epoch": 0.9632465543644717, "grad_norm": 4.438103763095202, "kl": 0.07177734375, "learning_rate": 3.3293029801403917e-09, "loss": 0.0001, "reward": 3.2249999046325684, "reward_std": 0.3224777579307556, "rewards/accuracy_reward": 1.9249999523162842, "rewards/format_reward": 1.0, "step": 629, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 323.96875, "epoch": 0.9647779479326187, "grad_norm": 4.66747144008146, "kl": 0.06494140625, "learning_rate": 3.0579183660177086e-09, "loss": 0.0001, "reward": 3.6624999046325684, "reward_std": 0.554044246673584, "rewards/accuracy_reward": 2.4375, "rewards/format_reward": 1.0, "step": 630, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 365.90625, "epoch": 0.9663093415007658, "grad_norm": 7.001370987648456, "kl": 0.06298828125, "learning_rate": 2.7980358831724004e-09, "loss": 0.0001, "reward": 4.1875, "reward_std": 0.44796818494796753, "rewards/accuracy_reward": 2.887500286102295, "rewards/format_reward": 1.0, "step": 631, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 383.46875, "epoch": 0.9678407350689127, "grad_norm": 5.139795499775845, "kl": 0.0673828125, "learning_rate": 2.549661546797255e-09, "loss": 0.0001, "reward": 3.6374998092651367, "reward_std": 0.29774677753448486, "rewards/accuracy_reward": 2.3374998569488525, "rewards/format_reward": 1.0, "step": 632, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 337.34375, "epoch": 0.9693721286370597, "grad_norm": 13.267348596865721, "kl": 0.0830078125, "learning_rate": 2.312801105719575e-09, "loss": 0.0001, "reward": 3.9124999046325684, "reward_std": 0.34188583493232727, "rewards/accuracy_reward": 2.612499952316284, "rewards/format_reward": 1.0, "step": 633, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 367.21875, "epoch": 0.9709035222052067, "grad_norm": 6.861770207450841, "kl": 0.05908203125, "learning_rate": 2.0874600422682297e-09, "loss": 0.0001, "reward": 2.9812498092651367, "reward_std": 0.36701488494873047, "rewards/accuracy_reward": 1.681249976158142, "rewards/format_reward": 1.0, "step": 634, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 322.34375, "epoch": 0.9724349157733537, "grad_norm": 3.549048561492334, "kl": 0.078125, "learning_rate": 1.8736435721465326e-09, "loss": 0.0001, "reward": 3.456249713897705, "reward_std": 0.2611098885536194, "rewards/accuracy_reward": 2.15625, "rewards/format_reward": 1.0, "step": 635, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 318.78125, "epoch": 0.9739663093415007, "grad_norm": 11.650883201241767, "kl": 0.07568359375, "learning_rate": 1.6713566443117832e-09, "loss": 0.0001, "reward": 3.9749999046325684, "reward_std": 0.493512898683548, "rewards/accuracy_reward": 2.674999952316284, "rewards/format_reward": 1.0, "step": 636, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 334.25, "epoch": 0.9754977029096478, "grad_norm": 6.177634684168555, "kl": 0.06884765625, "learning_rate": 1.4806039408604699e-09, "loss": 0.0001, "reward": 3.075000047683716, "reward_std": 0.2664228677749634, "rewards/accuracy_reward": 1.8499999046325684, "rewards/format_reward": 1.0, "step": 637, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 335.34375, "epoch": 0.9770290964777948, "grad_norm": 13.392275647815515, "kl": 0.0712890625, "learning_rate": 1.3013898769200783e-09, "loss": 0.0001, "reward": 3.5687499046325684, "reward_std": 0.3499985337257385, "rewards/accuracy_reward": 2.2687501907348633, "rewards/format_reward": 1.0, "step": 638, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 308.75, "epoch": 0.9785604900459418, "grad_norm": 6.08506315401085, "kl": 0.06982421875, "learning_rate": 1.1337186005467846e-09, "loss": 0.0001, "reward": 3.7249999046325684, "reward_std": 0.30260762572288513, "rewards/accuracy_reward": 2.424999713897705, "rewards/format_reward": 1.0, "step": 639, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 395.34375, "epoch": 0.9800918836140888, "grad_norm": 6.627925742618377, "kl": 0.057373046875, "learning_rate": 9.775939926296439e-10, "loss": 0.0001, "reward": 3.34375, "reward_std": 0.4554150104522705, "rewards/accuracy_reward": 2.043750047683716, "rewards/format_reward": 1.0, "step": 640, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 287.5625, "epoch": 0.9816232771822359, "grad_norm": 4.4909176339040195, "kl": 0.07666015625, "learning_rate": 8.33019666800383e-10, "loss": 0.0001, "reward": 4.512500286102295, "reward_std": 0.5600020885467529, "rewards/accuracy_reward": 3.2124998569488525, "rewards/format_reward": 1.0, "step": 641, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 325.09375, "epoch": 0.9831546707503829, "grad_norm": 5.439886025150774, "kl": 0.06884765625, "learning_rate": 6.999989693501906e-10, "loss": 0.0001, "reward": 4.0625, "reward_std": 0.43934160470962524, "rewards/accuracy_reward": 2.7624998092651367, "rewards/format_reward": 1.0, "step": 642, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 351.0625, "epoch": 0.9846860643185299, "grad_norm": 3.729869183229773, "kl": 0.07080078125, "learning_rate": 5.785349791520011e-10, "loss": 0.0001, "reward": 3.34375, "reward_std": 0.64092618227005, "rewards/accuracy_reward": 2.1187500953674316, "rewards/format_reward": 1.0, "step": 643, "temporal_rewards": 0.75 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 343.8125, "epoch": 0.9862174578866769, "grad_norm": 7.5174674414090035, "kl": 0.0693359375, "learning_rate": 4.686305075892738e-10, "loss": 0.0001, "reward": 4.206249713897705, "reward_std": 0.3077024221420288, "rewards/accuracy_reward": 2.90625, "rewards/format_reward": 1.0, "step": 644, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 296.71875, "epoch": 0.9877488514548239, "grad_norm": 10.685241592241573, "kl": 0.0791015625, "learning_rate": 3.7028809849098954e-10, "loss": 0.0001, "reward": 3.262500047683716, "reward_std": 0.3571416139602661, "rewards/accuracy_reward": 1.962499976158142, "rewards/format_reward": 1.0, "step": 645, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 326.71875, "epoch": 0.9892802450229708, "grad_norm": 8.356539394023253, "kl": 0.0712890625, "learning_rate": 2.835100280726976e-10, "loss": 0.0001, "reward": 3.5625, "reward_std": 0.4336293935775757, "rewards/accuracy_reward": 2.2624998092651367, "rewards/format_reward": 1.0, "step": 646, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 300.8125, "epoch": 0.9908116385911179, "grad_norm": 20.49471176250662, "kl": 0.06787109375, "learning_rate": 2.0829830488389154e-10, "loss": 0.0001, "reward": 3.5250000953674316, "reward_std": 0.5362038612365723, "rewards/accuracy_reward": 2.375, "rewards/format_reward": 1.0, "step": 647, "temporal_rewards": 0.5 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 305.8125, "epoch": 0.9923430321592649, "grad_norm": 5.603911872872956, "kl": 0.080078125, "learning_rate": 1.446546697614903e-10, "loss": 0.0001, "reward": 4.118750095367432, "reward_std": 0.46842336654663086, "rewards/accuracy_reward": 2.8187499046325684, "rewards/format_reward": 1.0, "step": 648, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 287.78125, "epoch": 0.9938744257274119, "grad_norm": 4.003878671046886, "kl": 0.07763671875, "learning_rate": 9.258059578948207e-11, "loss": 0.0001, "reward": 3.356250047683716, "reward_std": 0.25471654534339905, "rewards/accuracy_reward": 2.0562500953674316, "rewards/format_reward": 1.0, "step": 649, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 340.4375, "epoch": 0.9954058192955589, "grad_norm": 4.5992490785580795, "kl": 0.072265625, "learning_rate": 5.2077288264951166e-11, "loss": 0.0001, "reward": 3.0875000953674316, "reward_std": 0.4648074507713318, "rewards/accuracy_reward": 1.787500023841858, "rewards/format_reward": 1.0, "step": 650, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 375.40625, "epoch": 0.996937212863706, "grad_norm": 6.9796469597999735, "kl": 0.0732421875, "learning_rate": 2.3145684670100583e-11, "loss": 0.0001, "reward": 3.637500047683716, "reward_std": 0.42184919118881226, "rewards/accuracy_reward": 2.3375000953674316, "rewards/format_reward": 1.0, "step": 651, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 379.875, "epoch": 0.998468606431853, "grad_norm": 4.574612761450238, "kl": 0.06494140625, "learning_rate": 5.786454650602568e-12, "loss": 0.0001, "reward": 3.2749998569488525, "reward_std": 0.20265839993953705, "rewards/accuracy_reward": 1.975000023841858, "rewards/format_reward": 1.0, "step": 652, "temporal_rewards": 1.0 }, { "all_correct": 1.0, "all_wrong": 0.0, "completion_length": 319.75, "epoch": 1.0, "grad_norm": 6.773732480754798, "kl": 0.07080078125, "learning_rate": 0.0, "loss": 0.0001, "reward": 3.28125, "reward_std": 0.394453227519989, "rewards/accuracy_reward": 1.9812499284744263, "rewards/format_reward": 1.0, "step": 653, "temporal_rewards": 1.0 } ], "logging_steps": 1.0, "max_steps": 653, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }