| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.6928406466512702, | |
| "eval_steps": 500, | |
| "global_step": 150, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 107.87500381469727, | |
| "epoch": 0.004618937644341801, | |
| "grad_norm": 10.236255645751953, | |
| "kl": 0.0, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "reward": 1.8541667461395264, | |
| "reward_std": 0.1378917135298252, | |
| "rewards/accuracy_reward": 0.8541666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 96.77083587646484, | |
| "epoch": 0.009237875288683603, | |
| "grad_norm": 5.244990348815918, | |
| "kl": 0.00017547607421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 1.7708333730697632, | |
| "reward_std": 0.17311252653598785, | |
| "rewards/accuracy_reward": 0.7708333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 120.25, | |
| "epoch": 0.013856812933025405, | |
| "grad_norm": 4.743775367736816, | |
| "kl": 0.0006608963012695312, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 1.7916666865348816, | |
| "reward_std": 0.179558377712965, | |
| "rewards/accuracy_reward": 0.7916666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 94.3125, | |
| "epoch": 0.018475750577367205, | |
| "grad_norm": 3.1872682571411133, | |
| "kl": 0.0009775161743164062, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 1.8125000596046448, | |
| "reward_std": 0.1378917098045349, | |
| "rewards/accuracy_reward": 0.8125000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 76.95833587646484, | |
| "epoch": 0.023094688221709007, | |
| "grad_norm": 5.366634368896484, | |
| "kl": 0.00531768798828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.8750000596046448, | |
| "reward_std": 0.1666666716337204, | |
| "rewards/accuracy_reward": 0.9166666865348816, | |
| "rewards/format_reward": 0.9583333730697632, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 100.64583587646484, | |
| "epoch": 0.02771362586605081, | |
| "grad_norm": 3.2625043392181396, | |
| "kl": 0.0061187744140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.8958333730697632, | |
| "reward_std": 0.17311251163482666, | |
| "rewards/accuracy_reward": 0.8958333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 100.83333587646484, | |
| "epoch": 0.03233256351039261, | |
| "grad_norm": 2.418282985687256, | |
| "kl": 0.00472259521484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.7708333730697632, | |
| "reward_std": 0.17311252281069756, | |
| "rewards/accuracy_reward": 0.7708333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 111.625, | |
| "epoch": 0.03695150115473441, | |
| "grad_norm": 8.21818733215332, | |
| "kl": 0.003204345703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.8750000596046448, | |
| "reward_std": 0.13144585862755775, | |
| "rewards/accuracy_reward": 0.8750000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 93.5, | |
| "epoch": 0.04157043879907621, | |
| "grad_norm": 4.702052593231201, | |
| "kl": 0.008941650390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.8125000596046448, | |
| "reward_std": 0.25644585490226746, | |
| "rewards/accuracy_reward": 0.8125000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 148.33333587646484, | |
| "epoch": 0.046189376443418015, | |
| "grad_norm": 2.0280299186706543, | |
| "kl": 0.002277374267578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.8125000596046448, | |
| "reward_std": 0.18600423261523247, | |
| "rewards/accuracy_reward": 0.8125000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 76.89583587646484, | |
| "epoch": 0.050808314087759814, | |
| "grad_norm": 3.9683003425598145, | |
| "kl": 0.003360748291015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.8541667461395264, | |
| "reward_std": 0.1250000037252903, | |
| "rewards/accuracy_reward": 0.8541666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 96.02083587646484, | |
| "epoch": 0.05542725173210162, | |
| "grad_norm": 10.515838623046875, | |
| "kl": 0.01108551025390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.7083333730697632, | |
| "reward_std": 0.29811252653598785, | |
| "rewards/accuracy_reward": 0.7083333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 75.54166793823242, | |
| "epoch": 0.06004618937644342, | |
| "grad_norm": 4.704844951629639, | |
| "kl": 0.0057830810546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.8958333730697632, | |
| "reward_std": 0.1250000037252903, | |
| "rewards/accuracy_reward": 0.8958333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 100.41667175292969, | |
| "epoch": 0.06466512702078522, | |
| "grad_norm": 7.272250175476074, | |
| "kl": 0.005889892578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.7916667461395264, | |
| "reward_std": 0.25, | |
| "rewards/accuracy_reward": 0.7916666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 104.33333587646484, | |
| "epoch": 0.06928406466512702, | |
| "grad_norm": 3.6556761264801025, | |
| "kl": 0.0079803466796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.9375000596046448, | |
| "reward_std": 0.1250000037252903, | |
| "rewards/accuracy_reward": 0.9375000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 94.54167175292969, | |
| "epoch": 0.07390300230946882, | |
| "grad_norm": 4.878111362457275, | |
| "kl": 0.00466156005859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.7291666865348816, | |
| "reward_std": 0.17311252281069756, | |
| "rewards/accuracy_reward": 0.7291666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 69.5625, | |
| "epoch": 0.07852193995381063, | |
| "grad_norm": 11.191930770874023, | |
| "kl": 0.0043792724609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.8125000596046448, | |
| "reward_std": 0.2212250456213951, | |
| "rewards/accuracy_reward": 0.8125000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 89.70833396911621, | |
| "epoch": 0.08314087759815242, | |
| "grad_norm": 3.1972787380218506, | |
| "kl": 0.0032033920288085938, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.8541666865348816, | |
| "reward_std": 0.2212250456213951, | |
| "rewards/accuracy_reward": 0.8541666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 81.89583587646484, | |
| "epoch": 0.08775981524249422, | |
| "grad_norm": 30.53135108947754, | |
| "kl": 0.007843017578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.9375000596046448, | |
| "reward_std": 0.1250000037252903, | |
| "rewards/accuracy_reward": 0.9375000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 115.5625057220459, | |
| "epoch": 0.09237875288683603, | |
| "grad_norm": 2.5540764331817627, | |
| "kl": 0.0017900466918945312, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.7916666865348816, | |
| "reward_std": 0.2500000111758709, | |
| "rewards/accuracy_reward": 0.7916666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 107.08333587646484, | |
| "epoch": 0.09699769053117784, | |
| "grad_norm": 1.230578899383545, | |
| "kl": 0.0011348724365234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 1.8750000596046448, | |
| "reward_std": 0.1666666716337204, | |
| "rewards/accuracy_reward": 0.8750000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 46.52083396911621, | |
| "epoch": 0.10161662817551963, | |
| "grad_norm": 6.826656341552734, | |
| "kl": 0.02459096908569336, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.8958333730697632, | |
| "reward_std": 0.2083333432674408, | |
| "rewards/accuracy_reward": 0.8958333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 79.6875, | |
| "epoch": 0.10623556581986143, | |
| "grad_norm": 6.442953109741211, | |
| "kl": 0.004077911376953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.8125000596046448, | |
| "reward_std": 0.3045583665370941, | |
| "rewards/accuracy_reward": 0.8333333730697632, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 80.70833587646484, | |
| "epoch": 0.11085450346420324, | |
| "grad_norm": 9.262118339538574, | |
| "kl": 0.0145263671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.7708333730697632, | |
| "reward_std": 0.25644584745168686, | |
| "rewards/accuracy_reward": 0.7708333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 72.6875, | |
| "epoch": 0.11547344110854503, | |
| "grad_norm": 2.5963733196258545, | |
| "kl": 0.019866943359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.8750000596046448, | |
| "reward_std": 0.0833333358168602, | |
| "rewards/accuracy_reward": 0.8750000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 79.37500190734863, | |
| "epoch": 0.12009237875288684, | |
| "grad_norm": 8.336856842041016, | |
| "kl": 0.02008056640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.6875000596046448, | |
| "reward_std": 0.25644585117697716, | |
| "rewards/accuracy_reward": 0.6875000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 54.64583396911621, | |
| "epoch": 0.12471131639722864, | |
| "grad_norm": 7.557518482208252, | |
| "kl": 0.01947021484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.8125, | |
| "reward_std": 0.2083333358168602, | |
| "rewards/accuracy_reward": 0.8125, | |
| "rewards/format_reward": 1.0, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 57.4375, | |
| "epoch": 0.12933025404157045, | |
| "grad_norm": 4.195393085479736, | |
| "kl": 0.004122734069824219, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9583333730697632, | |
| "reward_std": 0.0833333358168602, | |
| "rewards/accuracy_reward": 0.9583333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 31.750000953674316, | |
| "epoch": 0.13394919168591224, | |
| "grad_norm": 8.026168823242188, | |
| "kl": 0.0205078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.8333333730697632, | |
| "reward_std": 0.0833333358168602, | |
| "rewards/accuracy_reward": 0.8333333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 63.0625, | |
| "epoch": 0.13856812933025403, | |
| "grad_norm": 9.261226654052734, | |
| "kl": 0.01236724853515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0005, | |
| "reward": 1.875, | |
| "reward_std": 0.048112522810697556, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 81.93750190734863, | |
| "epoch": 0.14318706697459585, | |
| "grad_norm": 8.847457885742188, | |
| "kl": 0.01412200927734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.8750000596046448, | |
| "reward_std": 0.179558377712965, | |
| "rewards/accuracy_reward": 0.8958333730697632, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 63.125, | |
| "epoch": 0.14780600461893764, | |
| "grad_norm": 9.623985290527344, | |
| "kl": 0.0164337158203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.8333333730697632, | |
| "reward_std": 0.2628917098045349, | |
| "rewards/accuracy_reward": 0.8333333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 65.35416793823242, | |
| "epoch": 0.15242494226327943, | |
| "grad_norm": 4.50661563873291, | |
| "kl": 0.01575469970703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.6875000596046448, | |
| "reward_std": 0.2083333358168602, | |
| "rewards/accuracy_reward": 0.6875000149011612, | |
| "rewards/format_reward": 1.0, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 65.10416793823242, | |
| "epoch": 0.15704387990762125, | |
| "grad_norm": 8.077963829040527, | |
| "kl": 0.06109619140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8750000596046448, | |
| "reward_std": 0.2500000074505806, | |
| "rewards/accuracy_reward": 0.8750000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 74.45833587646484, | |
| "epoch": 0.16166281755196305, | |
| "grad_norm": 9.616883277893066, | |
| "kl": 0.00418853759765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9166666865348816, | |
| "reward_std": 0.0833333358168602, | |
| "rewards/accuracy_reward": 0.9166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 56.16666793823242, | |
| "epoch": 0.16628175519630484, | |
| "grad_norm": 8.547237396240234, | |
| "kl": 0.014141082763671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.875, | |
| "reward_std": 0.1666666716337204, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 97.93750381469727, | |
| "epoch": 0.17090069284064666, | |
| "grad_norm": 0.9761996865272522, | |
| "kl": 0.01053619384765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.9166666865348816, | |
| "reward_std": 0.1666666679084301, | |
| "rewards/accuracy_reward": 0.9166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 101.79166793823242, | |
| "epoch": 0.17551963048498845, | |
| "grad_norm": 5.636431694030762, | |
| "kl": 0.0035552978515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.8541667461395264, | |
| "reward_std": 0.2212250456213951, | |
| "rewards/accuracy_reward": 0.8541666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 50.29166793823242, | |
| "epoch": 0.18013856812933027, | |
| "grad_norm": 5.020724773406982, | |
| "kl": 0.006103515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.8750000596046448, | |
| "reward_std": 0.179558377712965, | |
| "rewards/accuracy_reward": 0.8750000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 68.02083587646484, | |
| "epoch": 0.18475750577367206, | |
| "grad_norm": 1.0703792572021484, | |
| "kl": 0.0052642822265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9166667461395264, | |
| "reward_std": 0.13144585862755775, | |
| "rewards/accuracy_reward": 0.9166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 75.22916984558105, | |
| "epoch": 0.18937644341801385, | |
| "grad_norm": 2.469010829925537, | |
| "kl": 0.0019931793212890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.8958333730697632, | |
| "reward_std": 0.08977919071912766, | |
| "rewards/accuracy_reward": 0.8958333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 75.1875, | |
| "epoch": 0.19399538106235567, | |
| "grad_norm": 4.2055230140686035, | |
| "kl": 0.003253936767578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.9375000596046448, | |
| "reward_std": 0.08977919071912766, | |
| "rewards/accuracy_reward": 0.9375000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 79.54166793823242, | |
| "epoch": 0.19861431870669746, | |
| "grad_norm": 7.644351482391357, | |
| "kl": 0.0106658935546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.875, | |
| "reward_std": 0.25, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 81.04166793823242, | |
| "epoch": 0.20323325635103925, | |
| "grad_norm": 0.7677356600761414, | |
| "kl": 0.00148773193359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.8750000596046448, | |
| "reward_std": 0.21477919444441795, | |
| "rewards/accuracy_reward": 0.8750000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 62.770835876464844, | |
| "epoch": 0.20785219399538107, | |
| "grad_norm": 4.3740692138671875, | |
| "kl": 0.002414703369140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.8750000596046448, | |
| "reward_std": 0.1666666716337204, | |
| "rewards/accuracy_reward": 0.8750000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 78.35416793823242, | |
| "epoch": 0.21247113163972287, | |
| "grad_norm": 4.38349723815918, | |
| "kl": 0.005157470703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9375000596046448, | |
| "reward_std": 0.1250000037252903, | |
| "rewards/accuracy_reward": 0.9375000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 59.9375, | |
| "epoch": 0.21709006928406466, | |
| "grad_norm": 0.012896657921373844, | |
| "kl": 0.001483917236328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 63.520835876464844, | |
| "epoch": 0.22170900692840648, | |
| "grad_norm": 1.7839175462722778, | |
| "kl": 0.0340423583984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 52.41666793823242, | |
| "epoch": 0.22632794457274827, | |
| "grad_norm": 15.218417167663574, | |
| "kl": 0.0085296630859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.875, | |
| "reward_std": 0.0833333358168602, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 42.12500286102295, | |
| "epoch": 0.23094688221709006, | |
| "grad_norm": 0.7014623284339905, | |
| "kl": 0.001987457275390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.9583333730697632, | |
| "reward_std": 0.0833333358168602, | |
| "rewards/accuracy_reward": 0.9583333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 48.666666984558105, | |
| "epoch": 0.23556581986143188, | |
| "grad_norm": 0.49798333644866943, | |
| "kl": 0.0025177001953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.9375, | |
| "reward_std": 0.08977919071912766, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 67.47916793823242, | |
| "epoch": 0.24018475750577367, | |
| "grad_norm": 0.28330135345458984, | |
| "kl": 0.00286102294921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.9583333730697632, | |
| "reward_std": 0.048112522810697556, | |
| "rewards/accuracy_reward": 0.9583333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 58.020835876464844, | |
| "epoch": 0.24480369515011546, | |
| "grad_norm": 4.528405666351318, | |
| "kl": 0.004150390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.8958333730697632, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.8958333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 47.91666793823242, | |
| "epoch": 0.24942263279445728, | |
| "grad_norm": 0.3298056125640869, | |
| "kl": 0.00391387939453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 33.47916793823242, | |
| "epoch": 0.2540415704387991, | |
| "grad_norm": 0.016688158735632896, | |
| "kl": 0.00406646728515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 55.979169845581055, | |
| "epoch": 0.2586605080831409, | |
| "grad_norm": 0.3446481227874756, | |
| "kl": 0.004150390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 31.604167938232422, | |
| "epoch": 0.2632794457274827, | |
| "grad_norm": 4.186825275421143, | |
| "kl": 0.0166015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.8750000596046448, | |
| "reward_std": 0.0833333358168602, | |
| "rewards/accuracy_reward": 0.8750000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 54.708335876464844, | |
| "epoch": 0.2678983833718245, | |
| "grad_norm": 0.45439592003822327, | |
| "kl": 0.005859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.8958333730697632, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.8958333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 44.14583396911621, | |
| "epoch": 0.27251732101616627, | |
| "grad_norm": 0.3110947012901306, | |
| "kl": 0.01617431640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.9583333730697632, | |
| "reward_std": 0.048112522810697556, | |
| "rewards/accuracy_reward": 0.9583333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 56.56250190734863, | |
| "epoch": 0.27713625866050806, | |
| "grad_norm": 4.412627220153809, | |
| "kl": 0.00714111328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.8750000596046448, | |
| "reward_std": 0.1666666716337204, | |
| "rewards/accuracy_reward": 0.8750000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 24.52083396911621, | |
| "epoch": 0.2817551963048499, | |
| "grad_norm": 12.474504470825195, | |
| "kl": 0.02862548828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.7500000596046448, | |
| "reward_std": 0.0833333358168602, | |
| "rewards/accuracy_reward": 0.7500000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 42.08333396911621, | |
| "epoch": 0.2863741339491917, | |
| "grad_norm": 6.133255481719971, | |
| "kl": 0.02166748046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.9375, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 32.33333396911621, | |
| "epoch": 0.2909930715935335, | |
| "grad_norm": 18.329042434692383, | |
| "kl": 0.01953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.8958333730697632, | |
| "reward_std": 0.1250000037252903, | |
| "rewards/accuracy_reward": 0.8958333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 41.87500190734863, | |
| "epoch": 0.2956120092378753, | |
| "grad_norm": 0.8285086750984192, | |
| "kl": 0.00627899169921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.8333333730697632, | |
| "reward_std": 0.1666666679084301, | |
| "rewards/accuracy_reward": 0.8333333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 41.395835876464844, | |
| "epoch": 0.3002309468822171, | |
| "grad_norm": 12.128033638000488, | |
| "kl": 0.01610565185546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.9375, | |
| "reward_std": 0.125, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 51.145835876464844, | |
| "epoch": 0.30484988452655887, | |
| "grad_norm": 0.4265430271625519, | |
| "kl": 0.0070648193359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.9583333730697632, | |
| "reward_std": 0.0833333358168602, | |
| "rewards/accuracy_reward": 0.9583333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 43.50000190734863, | |
| "epoch": 0.3094688221709007, | |
| "grad_norm": 5.070745468139648, | |
| "kl": 0.0081787109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.8958333730697632, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.8958333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 42.604166984558105, | |
| "epoch": 0.3140877598152425, | |
| "grad_norm": 8.272510528564453, | |
| "kl": 0.01300048828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0005, | |
| "reward": 1.8333333730697632, | |
| "reward_std": 0.1666666716337204, | |
| "rewards/accuracy_reward": 0.8333333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 45.33333396911621, | |
| "epoch": 0.3187066974595843, | |
| "grad_norm": 0.4940468668937683, | |
| "kl": 0.00400543212890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9583333730697632, | |
| "reward_std": 0.0833333358168602, | |
| "rewards/accuracy_reward": 0.9583333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 34.70833492279053, | |
| "epoch": 0.3233256351039261, | |
| "grad_norm": 26.54204750061035, | |
| "kl": 0.02557373046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.8541666865348816, | |
| "reward_std": 0.1250000037252903, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 18.750000953674316, | |
| "epoch": 0.3279445727482679, | |
| "grad_norm": 14.518416404724121, | |
| "kl": 0.00531005859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9375000596046448, | |
| "reward_std": 0.08977919071912766, | |
| "rewards/accuracy_reward": 0.9375000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 28.541666984558105, | |
| "epoch": 0.3325635103926097, | |
| "grad_norm": 9.401073455810547, | |
| "kl": 0.004241943359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9375, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 33.47916793823242, | |
| "epoch": 0.3371824480369515, | |
| "grad_norm": 0.5438477396965027, | |
| "kl": 0.00969696044921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 28.625000953674316, | |
| "epoch": 0.3418013856812933, | |
| "grad_norm": 12.923347473144531, | |
| "kl": 0.006134033203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.8541666865348816, | |
| "reward_std": 0.08977919071912766, | |
| "rewards/accuracy_reward": 0.8541666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 38.875, | |
| "epoch": 0.3464203233256351, | |
| "grad_norm": 0.082447350025177, | |
| "kl": 0.0061798095703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 32.291666984558105, | |
| "epoch": 0.3510392609699769, | |
| "grad_norm": 6.839282989501953, | |
| "kl": 0.02349853515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.8750000596046448, | |
| "reward_std": 0.0833333358168602, | |
| "rewards/accuracy_reward": 0.8750000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 29.812500953674316, | |
| "epoch": 0.3556581986143187, | |
| "grad_norm": 0.014031085185706615, | |
| "kl": 0.0072021484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.9166666865348816, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.9166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 19.812500476837158, | |
| "epoch": 0.36027713625866054, | |
| "grad_norm": 0.3273457884788513, | |
| "kl": 0.0062103271484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 25.354167461395264, | |
| "epoch": 0.3648960739030023, | |
| "grad_norm": 0.3809925615787506, | |
| "kl": 0.00506591796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 18.791667461395264, | |
| "epoch": 0.3695150115473441, | |
| "grad_norm": 0.3350464105606079, | |
| "kl": 0.0046234130859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 18.83333396911621, | |
| "epoch": 0.3741339491916859, | |
| "grad_norm": 0.16990438103675842, | |
| "kl": 0.01031494140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.9166666865348816, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.9166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 26.895833492279053, | |
| "epoch": 0.3787528868360277, | |
| "grad_norm": 42.352142333984375, | |
| "kl": 0.0153961181640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.9166666865348816, | |
| "reward_std": 0.0833333358168602, | |
| "rewards/accuracy_reward": 0.9166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 22.791666984558105, | |
| "epoch": 0.3833718244803695, | |
| "grad_norm": 38.687313079833984, | |
| "kl": 0.023101806640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.7708333730697632, | |
| "reward_std": 0.17311252653598785, | |
| "rewards/accuracy_reward": 0.7708333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 15.166666984558105, | |
| "epoch": 0.38799076212471134, | |
| "grad_norm": 11.10213851928711, | |
| "kl": 0.00698089599609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 20.6875, | |
| "epoch": 0.39260969976905313, | |
| "grad_norm": 0.2621253430843353, | |
| "kl": 0.01202392578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0005, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 33.02083396911621, | |
| "epoch": 0.3972286374133949, | |
| "grad_norm": 0.24522998929023743, | |
| "kl": 0.005706787109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 25.25, | |
| "epoch": 0.4018475750577367, | |
| "grad_norm": 0.32983526587486267, | |
| "kl": 0.0078277587890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.8958333730697632, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.8958333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 29.416666984558105, | |
| "epoch": 0.4064665127020785, | |
| "grad_norm": 10.310896873474121, | |
| "kl": 0.022125244140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.9375, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 52.645835876464844, | |
| "epoch": 0.4110854503464203, | |
| "grad_norm": 0.3584893047809601, | |
| "kl": 0.02850341796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.8958333730697632, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.8958333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 20.750000476837158, | |
| "epoch": 0.41570438799076215, | |
| "grad_norm": 1.2553701400756836, | |
| "kl": 0.0139312744140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 39.9791693687439, | |
| "epoch": 0.42032332563510394, | |
| "grad_norm": 0.2528979182243347, | |
| "kl": 0.0077056884765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 33.83333396911621, | |
| "epoch": 0.42494226327944573, | |
| "grad_norm": 0.5499228835105896, | |
| "kl": 0.011077880859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.9583333730697632, | |
| "reward_std": 0.0833333358168602, | |
| "rewards/accuracy_reward": 0.9583333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 27.541666984558105, | |
| "epoch": 0.4295612009237875, | |
| "grad_norm": 0.012784978374838829, | |
| "kl": 0.0097503662109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 24.229167461395264, | |
| "epoch": 0.4341801385681293, | |
| "grad_norm": 13.934257507324219, | |
| "kl": 0.014495849609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.9583333730697632, | |
| "reward_std": 0.0833333358168602, | |
| "rewards/accuracy_reward": 0.9583333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 33.33333492279053, | |
| "epoch": 0.4387990762124711, | |
| "grad_norm": 0.22108934819698334, | |
| "kl": 0.0045166015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.8958333730697632, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.8958333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 20.875000953674316, | |
| "epoch": 0.44341801385681295, | |
| "grad_norm": 0.01029158290475607, | |
| "kl": 0.00502777099609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 31.583334922790527, | |
| "epoch": 0.44803695150115475, | |
| "grad_norm": 0.28462010622024536, | |
| "kl": 0.0048828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 15.0, | |
| "epoch": 0.45265588914549654, | |
| "grad_norm": 0.047917552292346954, | |
| "kl": 0.0049591064453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 18.479166984558105, | |
| "epoch": 0.45727482678983833, | |
| "grad_norm": 22.465944290161133, | |
| "kl": 0.0059356689453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9375, | |
| "reward_std": 0.08977919071912766, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 25.083333492279053, | |
| "epoch": 0.4618937644341801, | |
| "grad_norm": 0.008041063323616982, | |
| "kl": 0.003936767578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 37.833335876464844, | |
| "epoch": 0.4665127020785219, | |
| "grad_norm": 14.107870101928711, | |
| "kl": 0.007110595703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.7500000596046448, | |
| "reward_std": 0.1666666716337204, | |
| "rewards/accuracy_reward": 0.7500000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 15.104166984558105, | |
| "epoch": 0.47113163972286376, | |
| "grad_norm": 15.883159637451172, | |
| "kl": 0.0056915283203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 18.979167461395264, | |
| "epoch": 0.47575057736720555, | |
| "grad_norm": 11.000266075134277, | |
| "kl": 0.0062103271484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9375, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 18.875000953674316, | |
| "epoch": 0.48036951501154734, | |
| "grad_norm": 0.06595253199338913, | |
| "kl": 0.0058135986328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.8333333730697632, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.8333333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 20.312500476837158, | |
| "epoch": 0.48498845265588914, | |
| "grad_norm": 7.73568868637085, | |
| "kl": 0.01102447509765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.9583333730697632, | |
| "reward_std": 0.048112522810697556, | |
| "rewards/accuracy_reward": 0.9583333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 33.91666793823242, | |
| "epoch": 0.4896073903002309, | |
| "grad_norm": 1.0296730995178223, | |
| "kl": 0.0074462890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.9375, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 20.000000476837158, | |
| "epoch": 0.4942263279445728, | |
| "grad_norm": 0.011166680604219437, | |
| "kl": 0.0047149658203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 20.4375, | |
| "epoch": 0.49884526558891457, | |
| "grad_norm": 0.09496494382619858, | |
| "kl": 0.00506591796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 19.791666984558105, | |
| "epoch": 0.5034642032332564, | |
| "grad_norm": 0.013560502789914608, | |
| "kl": 0.00475311279296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 15.0, | |
| "epoch": 0.5080831408775982, | |
| "grad_norm": 0.056342754513025284, | |
| "kl": 0.00507354736328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9166666865348816, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.9166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 31.125001907348633, | |
| "epoch": 0.5127020785219399, | |
| "grad_norm": 12.659052848815918, | |
| "kl": 0.0098724365234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.9166666865348816, | |
| "reward_std": 0.0833333358168602, | |
| "rewards/accuracy_reward": 0.9166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 21.0, | |
| "epoch": 0.5173210161662818, | |
| "grad_norm": 2.68220591545105, | |
| "kl": 0.01091766357421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.9166666865348816, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.9166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 29.041666984558105, | |
| "epoch": 0.5219399538106235, | |
| "grad_norm": 38.36140823364258, | |
| "kl": 0.0079193115234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.9375000596046448, | |
| "reward_std": 0.08977919071912766, | |
| "rewards/accuracy_reward": 0.9375000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 26.375000953674316, | |
| "epoch": 0.5265588914549654, | |
| "grad_norm": 4.6412272453308105, | |
| "kl": 0.024200439453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.7708333730697632, | |
| "reward_std": 0.08977919071912766, | |
| "rewards/accuracy_reward": 0.7708333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 24.70833396911621, | |
| "epoch": 0.5311778290993071, | |
| "grad_norm": 0.01729178987443447, | |
| "kl": 0.014862060546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 22.875, | |
| "epoch": 0.535796766743649, | |
| "grad_norm": 1.4758672714233398, | |
| "kl": 0.011444091796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0005, | |
| "reward": 1.9166666865348816, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.9166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 45.125, | |
| "epoch": 0.5404157043879908, | |
| "grad_norm": 0.3602856695652008, | |
| "kl": 0.01806640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 32.33333396911621, | |
| "epoch": 0.5450346420323325, | |
| "grad_norm": 11.787064552307129, | |
| "kl": 0.0201416015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 33.10416793823242, | |
| "epoch": 0.5496535796766744, | |
| "grad_norm": 16.938676834106445, | |
| "kl": 0.0124664306640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0005, | |
| "reward": 1.9375, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 21.854167461395264, | |
| "epoch": 0.5542725173210161, | |
| "grad_norm": 5.987547874450684, | |
| "kl": 0.017303466796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.8958333730697632, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.8958333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 24.729167938232422, | |
| "epoch": 0.558891454965358, | |
| "grad_norm": 0.01790229044854641, | |
| "kl": 0.009857177734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.9166666865348816, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.9166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 15.333333492279053, | |
| "epoch": 0.5635103926096998, | |
| "grad_norm": 0.004795704036951065, | |
| "kl": 0.0053558349609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 19.25, | |
| "epoch": 0.5681293302540416, | |
| "grad_norm": 0.021862277761101723, | |
| "kl": 0.0130615234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0005, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 18.20833396911621, | |
| "epoch": 0.5727482678983834, | |
| "grad_norm": 0.01567975804209709, | |
| "kl": 0.0094451904296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 21.479166984558105, | |
| "epoch": 0.5773672055427251, | |
| "grad_norm": 8.731499671936035, | |
| "kl": 0.0161590576171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 15.083333492279053, | |
| "epoch": 0.581986143187067, | |
| "grad_norm": 25.284887313842773, | |
| "kl": 0.006011962890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9583333730697632, | |
| "reward_std": 0.048112522810697556, | |
| "rewards/accuracy_reward": 0.9583333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 24.312500953674316, | |
| "epoch": 0.5866050808314087, | |
| "grad_norm": 0.015062646940350533, | |
| "kl": 0.0047454833984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 37.66666793823242, | |
| "epoch": 0.5912240184757506, | |
| "grad_norm": 0.048227909952402115, | |
| "kl": 0.0377197265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.9166666865348816, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.9166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 19.291667461395264, | |
| "epoch": 0.5958429561200924, | |
| "grad_norm": 8.027989387512207, | |
| "kl": 0.0084075927734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.8750000596046448, | |
| "reward_std": 0.048112522810697556, | |
| "rewards/accuracy_reward": 0.8750000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 15.083333492279053, | |
| "epoch": 0.6004618937644342, | |
| "grad_norm": 0.0062526981346309185, | |
| "kl": 0.00661468505859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 22.375000476837158, | |
| "epoch": 0.605080831408776, | |
| "grad_norm": 0.026607630774378777, | |
| "kl": 0.0137176513671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0005, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 20.95833396911621, | |
| "epoch": 0.6096997690531177, | |
| "grad_norm": 0.00871030893176794, | |
| "kl": 0.0045928955078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 19.291666984558105, | |
| "epoch": 0.6143187066974596, | |
| "grad_norm": 0.0312328077852726, | |
| "kl": 0.0061187744140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9166666865348816, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.9166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 35.750000953674316, | |
| "epoch": 0.6189376443418014, | |
| "grad_norm": 0.07837596535682678, | |
| "kl": 0.01300048828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0005, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 15.250000476837158, | |
| "epoch": 0.6235565819861432, | |
| "grad_norm": 0.02661307342350483, | |
| "kl": 0.0044708251953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.8333333730697632, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.8333333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 47.916669845581055, | |
| "epoch": 0.628175519630485, | |
| "grad_norm": 0.025101438164711, | |
| "kl": 0.0069580078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.9166666865348816, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.9166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 26.479167461395264, | |
| "epoch": 0.6327944572748267, | |
| "grad_norm": 22.01622200012207, | |
| "kl": 0.02655029296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 31.95833396911621, | |
| "epoch": 0.6374133949191686, | |
| "grad_norm": 26.566116333007812, | |
| "kl": 0.012664794921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0005, | |
| "reward": 1.8541666865348816, | |
| "reward_std": 0.08977919071912766, | |
| "rewards/accuracy_reward": 0.8541666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 24.416667461395264, | |
| "epoch": 0.6420323325635104, | |
| "grad_norm": 0.47595617175102234, | |
| "kl": 0.006439208984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 20.437500476837158, | |
| "epoch": 0.6466512702078522, | |
| "grad_norm": 0.010236713103950024, | |
| "kl": 0.005096435546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9166666865348816, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.9166666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 35.16666793823242, | |
| "epoch": 0.651270207852194, | |
| "grad_norm": 0.3795807659626007, | |
| "kl": 0.0071563720703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 27.229166984558105, | |
| "epoch": 0.6558891454965358, | |
| "grad_norm": 0.025517858564853668, | |
| "kl": 0.0072021484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 15.0, | |
| "epoch": 0.6605080831408776, | |
| "grad_norm": 7.9069013595581055, | |
| "kl": 0.00848388671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.9791666865348816, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.9791666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 17.83333396911621, | |
| "epoch": 0.6651270207852193, | |
| "grad_norm": 0.014802374877035618, | |
| "kl": 0.00531005859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.8333333730697632, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.8333333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 23.291667461395264, | |
| "epoch": 0.6697459584295612, | |
| "grad_norm": 8.15732479095459, | |
| "kl": 0.005279541015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.9583333730697632, | |
| "reward_std": 0.048112522810697556, | |
| "rewards/accuracy_reward": 0.9583333432674408, | |
| "rewards/format_reward": 1.0, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 21.791667461395264, | |
| "epoch": 0.674364896073903, | |
| "grad_norm": 0.008288037963211536, | |
| "kl": 0.00370025634765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 34.47916793823242, | |
| "epoch": 0.6789838337182448, | |
| "grad_norm": 0.3849964439868927, | |
| "kl": 0.011077880859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.8125000596046448, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.8125000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 28.166667461395264, | |
| "epoch": 0.6836027713625866, | |
| "grad_norm": 0.012812400236725807, | |
| "kl": 0.00555419921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 45.85416793823242, | |
| "epoch": 0.6882217090069284, | |
| "grad_norm": 0.016709528863430023, | |
| "kl": 0.009246826171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 51.25000286102295, | |
| "epoch": 0.6928406466512702, | |
| "grad_norm": 0.6184998154640198, | |
| "kl": 0.012664794921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0005, | |
| "reward": 1.9583333730697632, | |
| "reward_std": 0.0833333358168602, | |
| "rewards/accuracy_reward": 0.9583333730697632, | |
| "rewards/format_reward": 1.0, | |
| "step": 150 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 216, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |