| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.968421052631579, | |
| "eval_steps": 10, | |
| "global_step": 69, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 585.2708511352539, | |
| "epoch": 0.042105263157894736, | |
| "grad_norm": 0.8169388771057129, | |
| "kl": 0.0, | |
| "learning_rate": 4.2857142857142857e-07, | |
| "loss": 0.0297, | |
| "reward": 0.6041666679084301, | |
| "reward_std": 0.13301505148410797, | |
| "rewards/accuracy_reward": 0.6041666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 598.4791946411133, | |
| "epoch": 0.08421052631578947, | |
| "grad_norm": 0.7387536764144897, | |
| "kl": 0.0, | |
| "learning_rate": 8.571428571428571e-07, | |
| "loss": 0.0556, | |
| "reward": 0.6458333414047956, | |
| "reward_std": 0.2350771240890026, | |
| "rewards/accuracy_reward": 0.6458333414047956, | |
| "rewards/format_reward": 0.0, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 571.4166831970215, | |
| "epoch": 0.12631578947368421, | |
| "grad_norm": 0.29826676845550537, | |
| "kl": -2.4423934519290924e-05, | |
| "learning_rate": 1.2857142857142856e-06, | |
| "loss": 0.0482, | |
| "reward": 0.6666666734963655, | |
| "reward_std": 0.30354244261980057, | |
| "rewards/accuracy_reward": 0.6666666734963655, | |
| "rewards/format_reward": 0.0, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 577.4375114440918, | |
| "epoch": 0.16842105263157894, | |
| "grad_norm": 0.3846527636051178, | |
| "kl": 1.6279518604278564e-05, | |
| "learning_rate": 1.7142857142857143e-06, | |
| "loss": 0.049, | |
| "reward": 0.6041666753590107, | |
| "reward_std": 0.1801304966211319, | |
| "rewards/accuracy_reward": 0.6041666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 382.87501525878906, | |
| "epoch": 0.21052631578947367, | |
| "grad_norm": 0.6718897223472595, | |
| "kl": 2.345442771911621e-05, | |
| "learning_rate": 2.142857142857143e-06, | |
| "loss": 0.0309, | |
| "reward": 0.8541666753590107, | |
| "reward_std": 0.1801304966211319, | |
| "rewards/accuracy_reward": 0.8541666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 497.8333435058594, | |
| "epoch": 0.25263157894736843, | |
| "grad_norm": 1.5279134511947632, | |
| "kl": 0.0005899444222450256, | |
| "learning_rate": 2.571428571428571e-06, | |
| "loss": 0.0471, | |
| "reward": 0.8333333414047956, | |
| "reward_std": 0.16661180183291435, | |
| "rewards/accuracy_reward": 0.8333333414047956, | |
| "rewards/format_reward": 0.0, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 630.7708473205566, | |
| "epoch": 0.29473684210526313, | |
| "grad_norm": 0.8395237326622009, | |
| "kl": 0.0029467493295669556, | |
| "learning_rate": 3e-06, | |
| "loss": 0.045, | |
| "reward": 0.5000000018626451, | |
| "reward_std": 0.18404608964920044, | |
| "rewards/accuracy_reward": 0.5000000018626451, | |
| "rewards/format_reward": 0.0, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 452.4791831970215, | |
| "epoch": 0.3368421052631579, | |
| "grad_norm": 0.03908955678343773, | |
| "kl": -4.875659942626953e-05, | |
| "learning_rate": 2.9980747607565792e-06, | |
| "loss": 0.0059, | |
| "reward": 0.7708333432674408, | |
| "reward_std": 0.2446802258491516, | |
| "rewards/accuracy_reward": 0.7708333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 664.0625152587891, | |
| "epoch": 0.37894736842105264, | |
| "grad_norm": 0.1090988889336586, | |
| "kl": 0.05594288744032383, | |
| "learning_rate": 2.9923039850878425e-06, | |
| "loss": 0.0021, | |
| "reward": 0.4791666716337204, | |
| "reward_std": 0.05103103816509247, | |
| "rewards/accuracy_reward": 0.4791666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 663.8750076293945, | |
| "epoch": 0.42105263157894735, | |
| "grad_norm": 0.9894853234291077, | |
| "kl": 0.13722804933786392, | |
| "learning_rate": 2.982702486492167e-06, | |
| "loss": 0.0697, | |
| "reward": 0.39583333767950535, | |
| "reward_std": 0.1801304928958416, | |
| "rewards/accuracy_reward": 0.39583333767950535, | |
| "rewards/format_reward": 0.0, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 589.1458473205566, | |
| "epoch": 0.4631578947368421, | |
| "grad_norm": 0.6284940838813782, | |
| "kl": 0.1499086245894432, | |
| "learning_rate": 2.969294911878742e-06, | |
| "loss": 0.057, | |
| "reward": 0.4791666716337204, | |
| "reward_std": 0.299626849591732, | |
| "rewards/accuracy_reward": 0.4791666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 456.3125228881836, | |
| "epoch": 0.5052631578947369, | |
| "grad_norm": 0.2573056221008301, | |
| "kl": 0.1363532543182373, | |
| "learning_rate": 2.9521156782993067e-06, | |
| "loss": 0.0423, | |
| "reward": 0.5833333376795053, | |
| "reward_std": 0.24859581887722015, | |
| "rewards/accuracy_reward": 0.5833333376795053, | |
| "rewards/format_reward": 0.0, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 660.6875152587891, | |
| "epoch": 0.5473684210526316, | |
| "grad_norm": 0.5341625809669495, | |
| "kl": 1.2825465202331543, | |
| "learning_rate": 2.9312088846000733e-06, | |
| "loss": 0.077, | |
| "reward": 0.37500000931322575, | |
| "reward_std": 0.23116153106093407, | |
| "rewards/accuracy_reward": 0.37500000931322575, | |
| "rewards/format_reward": 0.0, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 550.9375190734863, | |
| "epoch": 0.5894736842105263, | |
| "grad_norm": 0.15721110999584198, | |
| "kl": 0.5056180357933044, | |
| "learning_rate": 2.906628198220621e-06, | |
| "loss": 0.0395, | |
| "reward": 0.5416666716337204, | |
| "reward_std": 0.23116152733564377, | |
| "rewards/accuracy_reward": 0.5416666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 436.4166851043701, | |
| "epoch": 0.631578947368421, | |
| "grad_norm": 0.4244968295097351, | |
| "kl": 0.5005574226379395, | |
| "learning_rate": 2.878436717430346e-06, | |
| "loss": 0.0356, | |
| "reward": 0.7291666753590107, | |
| "reward_std": 0.1801304966211319, | |
| "rewards/accuracy_reward": 0.7291666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 589.3958473205566, | |
| "epoch": 0.6736842105263158, | |
| "grad_norm": 0.7619456648826599, | |
| "kl": 1.6163926124572754, | |
| "learning_rate": 2.846706809356113e-06, | |
| "loss": 0.1662, | |
| "reward": 0.5416666753590107, | |
| "reward_std": 0.24859581887722015, | |
| "rewards/accuracy_reward": 0.5416666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 579.854190826416, | |
| "epoch": 0.7157894736842105, | |
| "grad_norm": 0.12104064226150513, | |
| "kl": 0.1672835350036621, | |
| "learning_rate": 2.811519924216873e-06, | |
| "loss": 0.0623, | |
| "reward": 0.7083333488553762, | |
| "reward_std": 0.3506578877568245, | |
| "rewards/accuracy_reward": 0.7083333488553762, | |
| "rewards/format_reward": 0.0, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 544.5000076293945, | |
| "epoch": 0.7578947368421053, | |
| "grad_norm": 0.5459911823272705, | |
| "kl": 1.7731904983520508, | |
| "learning_rate": 2.7729663862421267e-06, | |
| "loss": 0.1406, | |
| "reward": 0.6250000055879354, | |
| "reward_std": 0.24859581515192986, | |
| "rewards/accuracy_reward": 0.6250000055879354, | |
| "rewards/format_reward": 0.0, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 571.2291831970215, | |
| "epoch": 0.8, | |
| "grad_norm": 0.6385266184806824, | |
| "kl": 1.6936622858047485, | |
| "learning_rate": 2.731145161810915e-06, | |
| "loss": 0.1112, | |
| "reward": 0.5625000074505806, | |
| "reward_std": 0.11558076366782188, | |
| "rewards/accuracy_reward": 0.5625000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 517.1666793823242, | |
| "epoch": 0.8421052631578947, | |
| "grad_norm": 0.5825877785682678, | |
| "kl": 1.8140544891357422, | |
| "learning_rate": 2.6861636054065477e-06, | |
| "loss": 0.0989, | |
| "reward": 0.7083333395421505, | |
| "reward_std": 0.19364918768405914, | |
| "rewards/accuracy_reward": 0.7083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 560.062520980835, | |
| "epoch": 0.8842105263157894, | |
| "grad_norm": 1.2789912223815918, | |
| "kl": 3.327120780944824, | |
| "learning_rate": 2.6381371840391863e-06, | |
| "loss": 0.1927, | |
| "reward": 0.5833333414047956, | |
| "reward_std": 0.16661180183291435, | |
| "rewards/accuracy_reward": 0.5833333414047956, | |
| "rewards/format_reward": 0.0, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 582.2500152587891, | |
| "epoch": 0.9263157894736842, | |
| "grad_norm": 1.2971068620681763, | |
| "kl": 2.9201011657714844, | |
| "learning_rate": 2.58718918084368e-06, | |
| "loss": 0.2073, | |
| "reward": 0.3958333469927311, | |
| "reward_std": 0.299626849591732, | |
| "rewards/accuracy_reward": 0.3958333469927311, | |
| "rewards/format_reward": 0.0, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 574.03125, | |
| "epoch": 0.968421052631579, | |
| "grad_norm": 0.9836662411689758, | |
| "kl": 2.003514289855957, | |
| "learning_rate": 2.53345037861353e-06, | |
| "loss": 0.1104, | |
| "reward": 0.7083333432674408, | |
| "reward_std": 0.10206207260489464, | |
| "rewards/accuracy_reward": 0.7083333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 670.1666831970215, | |
| "epoch": 1.0421052631578946, | |
| "grad_norm": 1.2662800550460815, | |
| "kl": 3.6266231536865234, | |
| "learning_rate": 2.477058724083334e-06, | |
| "loss": 0.2207, | |
| "reward": 0.47916667722165585, | |
| "reward_std": 0.1530931107699871, | |
| "rewards/accuracy_reward": 0.47916667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 757.0625152587891, | |
| "epoch": 1.0842105263157895, | |
| "grad_norm": 0.8487921357154846, | |
| "kl": 1.7164249420166016, | |
| "learning_rate": 2.4181589738214946e-06, | |
| "loss": 0.0993, | |
| "reward": 0.45833333395421505, | |
| "reward_std": 0.11949636042118073, | |
| "rewards/accuracy_reward": 0.45833333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 620.5000171661377, | |
| "epoch": 1.1263157894736842, | |
| "grad_norm": 2.0166633129119873, | |
| "kl": 0.5937175750732422, | |
| "learning_rate": 2.3569023226421886e-06, | |
| "loss": 0.1125, | |
| "reward": 0.6041666828095913, | |
| "reward_std": 0.36417658254504204, | |
| "rewards/accuracy_reward": 0.6041666828095913, | |
| "rewards/format_reward": 0.0, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 661.8541717529297, | |
| "epoch": 1.168421052631579, | |
| "grad_norm": 1.0673203468322754, | |
| "kl": 0.7642230987548828, | |
| "learning_rate": 2.2934460154904436e-06, | |
| "loss": 0.1004, | |
| "reward": 0.5208333395421505, | |
| "reward_std": 0.1530931033194065, | |
| "rewards/accuracy_reward": 0.5208333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 505.583345413208, | |
| "epoch": 1.2105263157894737, | |
| "grad_norm": 2.2793140411376953, | |
| "kl": 0.50286865234375, | |
| "learning_rate": 2.227952943796622e-06, | |
| "loss": 0.1271, | |
| "reward": 0.6875000074505806, | |
| "reward_std": 0.2525114119052887, | |
| "rewards/accuracy_reward": 0.6875000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 590.2500076293945, | |
| "epoch": 1.2526315789473683, | |
| "grad_norm": 0.7321359515190125, | |
| "kl": 0.8715305328369141, | |
| "learning_rate": 2.160591227336452e-06, | |
| "loss": 0.0905, | |
| "reward": 0.6875000074505806, | |
| "reward_std": 0.11558076366782188, | |
| "rewards/accuracy_reward": 0.6875000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 611.6666831970215, | |
| "epoch": 1.2947368421052632, | |
| "grad_norm": 2.0107011795043945, | |
| "kl": 1.0236454010009766, | |
| "learning_rate": 2.091533782669978e-06, | |
| "loss": 0.1705, | |
| "reward": 0.562500013038516, | |
| "reward_std": 0.28219256550073624, | |
| "rewards/accuracy_reward": 0.562500013038516, | |
| "rewards/format_reward": 0.0, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 567.8333473205566, | |
| "epoch": 1.3368421052631578, | |
| "grad_norm": 2.7980597019195557, | |
| "kl": 1.0768804550170898, | |
| "learning_rate": 2.0209578792672304e-06, | |
| "loss": 0.2222, | |
| "reward": 0.6458333525806665, | |
| "reward_std": 0.40168892592191696, | |
| "rewards/accuracy_reward": 0.6458333525806665, | |
| "rewards/format_reward": 0.0, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 769.7291736602783, | |
| "epoch": 1.3789473684210527, | |
| "grad_norm": 3.1634137630462646, | |
| "kl": 3.4175052642822266, | |
| "learning_rate": 1.9490446844600373e-06, | |
| "loss": 0.1332, | |
| "reward": 0.3541666716337204, | |
| "reward_std": 0.05103103443980217, | |
| "rewards/accuracy_reward": 0.3541666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 748.3541831970215, | |
| "epoch": 1.4210526315789473, | |
| "grad_norm": 2.0227866172790527, | |
| "kl": 4.861572265625, | |
| "learning_rate": 1.875978798388081e-06, | |
| "loss": 0.2971, | |
| "reward": 0.33333334140479565, | |
| "reward_std": 0.16661180183291435, | |
| "rewards/accuracy_reward": 0.33333334140479565, | |
| "rewards/format_reward": 0.0, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 787.1250228881836, | |
| "epoch": 1.4631578947368422, | |
| "grad_norm": 2.728727102279663, | |
| "kl": 4.805639266967773, | |
| "learning_rate": 1.8019477801329903e-06, | |
| "loss": 0.2784, | |
| "reward": 0.2708333395421505, | |
| "reward_std": 0.2621145099401474, | |
| "rewards/accuracy_reward": 0.2708333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 545.7083473205566, | |
| "epoch": 1.5052631578947369, | |
| "grad_norm": 0.2891092598438263, | |
| "kl": 1.8428325653076172, | |
| "learning_rate": 1.7271416662568652e-06, | |
| "loss": 0.1843, | |
| "reward": 0.5416666753590107, | |
| "reward_std": 0.24859581515192986, | |
| "rewards/accuracy_reward": 0.5416666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 707.5208435058594, | |
| "epoch": 1.5473684210526315, | |
| "grad_norm": 0.8343372344970703, | |
| "kl": 3.2880020141601562, | |
| "learning_rate": 1.6517524829811483e-06, | |
| "loss": 0.2373, | |
| "reward": 0.3750000074505806, | |
| "reward_std": 0.19364918768405914, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 603.3125152587891, | |
| "epoch": 1.5894736842105264, | |
| "grad_norm": 0.24261082708835602, | |
| "kl": 1.7664175033569336, | |
| "learning_rate": 1.5759737532580691e-06, | |
| "loss": 0.1494, | |
| "reward": 0.5833333376795053, | |
| "reward_std": 0.24859581887722015, | |
| "rewards/accuracy_reward": 0.5833333376795053, | |
| "rewards/format_reward": 0.0, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 460.6250190734863, | |
| "epoch": 1.631578947368421, | |
| "grad_norm": 0.5113571286201477, | |
| "kl": 1.065016746520996, | |
| "learning_rate": 1.5e-06, | |
| "loss": 0.1319, | |
| "reward": 0.7083333432674408, | |
| "reward_std": 0.24859581887722015, | |
| "rewards/accuracy_reward": 0.7083333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 664.0625228881836, | |
| "epoch": 1.6736842105263157, | |
| "grad_norm": 1.053536057472229, | |
| "kl": 3.0947265625, | |
| "learning_rate": 1.4240262467419312e-06, | |
| "loss": 0.2513, | |
| "reward": 0.4166666828095913, | |
| "reward_std": 0.2957112602889538, | |
| "rewards/accuracy_reward": 0.4166666828095913, | |
| "rewards/format_reward": 0.0, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 605.229175567627, | |
| "epoch": 1.7157894736842105, | |
| "grad_norm": 1.1233307123184204, | |
| "kl": 0.8006033897399902, | |
| "learning_rate": 1.348247517018852e-06, | |
| "loss": 0.1101, | |
| "reward": 0.7291666846722364, | |
| "reward_std": 0.33713918924331665, | |
| "rewards/accuracy_reward": 0.7291666846722364, | |
| "rewards/format_reward": 0.0, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 618.6250076293945, | |
| "epoch": 1.7578947368421054, | |
| "grad_norm": 0.22662228345870972, | |
| "kl": 2.493999481201172, | |
| "learning_rate": 1.2728583337431355e-06, | |
| "loss": 0.1554, | |
| "reward": 0.5833333395421505, | |
| "reward_std": 0.18404608592391014, | |
| "rewards/accuracy_reward": 0.5833333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 666.4583511352539, | |
| "epoch": 1.8, | |
| "grad_norm": 0.6697850227355957, | |
| "kl": 2.398993492126465, | |
| "learning_rate": 1.1980522198670096e-06, | |
| "loss": 0.2155, | |
| "reward": 0.39583334140479565, | |
| "reward_std": 0.2350771278142929, | |
| "rewards/accuracy_reward": 0.39583334140479565, | |
| "rewards/format_reward": 0.0, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 540.479175567627, | |
| "epoch": 1.8421052631578947, | |
| "grad_norm": 0.615111768245697, | |
| "kl": 1.5004844665527344, | |
| "learning_rate": 1.1240212016119191e-06, | |
| "loss": 0.1012, | |
| "reward": 0.6041666846722364, | |
| "reward_std": 0.33713919669389725, | |
| "rewards/accuracy_reward": 0.6041666846722364, | |
| "rewards/format_reward": 0.0, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 553.645845413208, | |
| "epoch": 1.8842105263157896, | |
| "grad_norm": 1.1411396265029907, | |
| "kl": 2.1822094917297363, | |
| "learning_rate": 1.050955315539963e-06, | |
| "loss": 0.1153, | |
| "reward": 0.5833333358168602, | |
| "reward_std": 0.06454972922801971, | |
| "rewards/accuracy_reward": 0.5833333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 620.5625076293945, | |
| "epoch": 1.9263157894736842, | |
| "grad_norm": 1.032118558883667, | |
| "kl": 2.441394805908203, | |
| "learning_rate": 9.790421207327699e-07, | |
| "loss": 0.1346, | |
| "reward": 0.5000000037252903, | |
| "reward_std": 0.19364918768405914, | |
| "rewards/accuracy_reward": 0.5000000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 507.84375, | |
| "epoch": 1.9684210526315788, | |
| "grad_norm": 0.3780059516429901, | |
| "kl": 1.3306865692138672, | |
| "learning_rate": 9.084662173300225e-07, | |
| "loss": 0.0993, | |
| "reward": 0.7708333432674408, | |
| "reward_std": 0.1801304928958416, | |
| "rewards/accuracy_reward": 0.7708333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 639.3958396911621, | |
| "epoch": 2.042105263157895, | |
| "grad_norm": 0.9595404267311096, | |
| "kl": 2.2831053733825684, | |
| "learning_rate": 8.394087726635485e-07, | |
| "loss": 0.1552, | |
| "reward": 0.6458333395421505, | |
| "reward_std": 0.11558076366782188, | |
| "rewards/accuracy_reward": 0.6458333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 461.06250762939453, | |
| "epoch": 2.0842105263157893, | |
| "grad_norm": 0.22546370327472687, | |
| "kl": 1.1452112197875977, | |
| "learning_rate": 7.720470562033787e-07, | |
| "loss": 0.0808, | |
| "reward": 0.7708333395421505, | |
| "reward_std": 0.11558076366782188, | |
| "rewards/accuracy_reward": 0.7708333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 614.6875152587891, | |
| "epoch": 2.126315789473684, | |
| "grad_norm": 1.2044512033462524, | |
| "kl": 2.3818206787109375, | |
| "learning_rate": 7.065539845095568e-07, | |
| "loss": 0.1233, | |
| "reward": 0.5833333432674408, | |
| "reward_std": 0.10206207633018494, | |
| "rewards/accuracy_reward": 0.5833333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 573.7916870117188, | |
| "epoch": 2.168421052631579, | |
| "grad_norm": 0.28790467977523804, | |
| "kl": 1.2321147918701172, | |
| "learning_rate": 6.430976773578113e-07, | |
| "loss": 0.092, | |
| "reward": 0.6875000074505806, | |
| "reward_std": 0.11558076366782188, | |
| "rewards/accuracy_reward": 0.6875000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 584.2916717529297, | |
| "epoch": 2.2105263157894735, | |
| "grad_norm": 0.22598668932914734, | |
| "kl": 1.0693860054016113, | |
| "learning_rate": 5.818410261785057e-07, | |
| "loss": 0.0745, | |
| "reward": 0.6875000018626451, | |
| "reward_std": 0.11558076366782188, | |
| "rewards/accuracy_reward": 0.6875000018626451, | |
| "rewards/format_reward": 0.0, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 675.6875038146973, | |
| "epoch": 2.2526315789473683, | |
| "grad_norm": 0.7614906430244446, | |
| "kl": 1.5083866119384766, | |
| "learning_rate": 5.22941275916667e-07, | |
| "loss": 0.0763, | |
| "reward": 0.41666667349636555, | |
| "reward_std": 0.16661180183291435, | |
| "rewards/accuracy_reward": 0.41666667349636555, | |
| "rewards/format_reward": 0.0, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 415.2083435058594, | |
| "epoch": 2.294736842105263, | |
| "grad_norm": 0.25470709800720215, | |
| "kl": 0.29476356506347656, | |
| "learning_rate": 4.6654962138647007e-07, | |
| "loss": 0.022, | |
| "reward": 0.8125000149011612, | |
| "reward_std": 0.1530931070446968, | |
| "rewards/accuracy_reward": 0.8125000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 528.9583415985107, | |
| "epoch": 2.336842105263158, | |
| "grad_norm": 0.6020339727401733, | |
| "kl": 0.6340560913085938, | |
| "learning_rate": 4.1281081915632036e-07, | |
| "loss": 0.0636, | |
| "reward": 0.5416666734963655, | |
| "reward_std": 0.16661179810762405, | |
| "rewards/accuracy_reward": 0.5416666734963655, | |
| "rewards/format_reward": 0.0, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 557.4583358764648, | |
| "epoch": 2.3789473684210525, | |
| "grad_norm": 0.2363707721233368, | |
| "kl": 0.6674823760986328, | |
| "learning_rate": 3.618628159608137e-07, | |
| "loss": 0.0476, | |
| "reward": 0.7083333432674408, | |
| "reward_std": 0.10206207633018494, | |
| "rewards/accuracy_reward": 0.7083333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 430.66668128967285, | |
| "epoch": 2.4210526315789473, | |
| "grad_norm": 0.4364229738712311, | |
| "kl": 0.07082462310791016, | |
| "learning_rate": 3.1383639459345236e-07, | |
| "loss": 0.0172, | |
| "reward": 0.8333333432674408, | |
| "reward_std": 0.10206207633018494, | |
| "rewards/accuracy_reward": 0.8333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 442.2916793823242, | |
| "epoch": 2.463157894736842, | |
| "grad_norm": 0.7256594896316528, | |
| "kl": 0.24945640563964844, | |
| "learning_rate": 2.688548381890859e-07, | |
| "loss": 0.0578, | |
| "reward": 0.7083333395421505, | |
| "reward_std": 0.19364918768405914, | |
| "rewards/accuracy_reward": 0.7083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 627.0000076293945, | |
| "epoch": 2.5052631578947366, | |
| "grad_norm": 1.6205965280532837, | |
| "kl": 0.4336977005004883, | |
| "learning_rate": 2.2703361375787346e-07, | |
| "loss": 0.1196, | |
| "reward": 0.562500013038516, | |
| "reward_std": 0.28219256922602654, | |
| "rewards/accuracy_reward": 0.562500013038516, | |
| "rewards/format_reward": 0.0, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 615.5833511352539, | |
| "epoch": 2.5473684210526315, | |
| "grad_norm": 0.5687099695205688, | |
| "kl": 0.3663787841796875, | |
| "learning_rate": 1.8848007578312686e-07, | |
| "loss": 0.0396, | |
| "reward": 0.5208333414047956, | |
| "reward_std": 0.299626849591732, | |
| "rewards/accuracy_reward": 0.5208333414047956, | |
| "rewards/format_reward": 0.0, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 508.3958396911621, | |
| "epoch": 2.5894736842105264, | |
| "grad_norm": 0.2119116485118866, | |
| "kl": 0.1974639892578125, | |
| "learning_rate": 1.5329319064388763e-07, | |
| "loss": 0.0028, | |
| "reward": 0.8541666716337204, | |
| "reward_std": 0.05103103443980217, | |
| "rewards/accuracy_reward": 0.8541666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 606.3333435058594, | |
| "epoch": 2.6315789473684212, | |
| "grad_norm": 0.17258569598197937, | |
| "kl": 0.7138900756835938, | |
| "learning_rate": 1.215632825696541e-07, | |
| "loss": 0.0576, | |
| "reward": 0.39583334140479565, | |
| "reward_std": 0.2350771203637123, | |
| "rewards/accuracy_reward": 0.39583334140479565, | |
| "rewards/format_reward": 0.0, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 549.6041831970215, | |
| "epoch": 2.6736842105263157, | |
| "grad_norm": 0.8958130478858948, | |
| "kl": 0.40015602111816406, | |
| "learning_rate": 9.337180177937954e-08, | |
| "loss": 0.0813, | |
| "reward": 0.5625000111758709, | |
| "reward_std": 0.2446802221238613, | |
| "rewards/accuracy_reward": 0.5625000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 578.0000114440918, | |
| "epoch": 2.7157894736842105, | |
| "grad_norm": 0.11735378205776215, | |
| "kl": 0.5429248809814453, | |
| "learning_rate": 6.879111539992677e-08, | |
| "loss": 0.0507, | |
| "reward": 0.7083333358168602, | |
| "reward_std": 0.06454972922801971, | |
| "rewards/accuracy_reward": 0.7083333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 607.8750228881836, | |
| "epoch": 2.7578947368421054, | |
| "grad_norm": 1.0303009748458862, | |
| "kl": 0.9785404205322266, | |
| "learning_rate": 4.788432170069373e-08, | |
| "loss": 0.1647, | |
| "reward": 0.6041666828095913, | |
| "reward_std": 0.36417658627033234, | |
| "rewards/accuracy_reward": 0.6041666828095913, | |
| "rewards/format_reward": 0.0, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 448.7083511352539, | |
| "epoch": 2.8, | |
| "grad_norm": 1.2717252969741821, | |
| "kl": 0.3193473815917969, | |
| "learning_rate": 3.0705088121258276e-08, | |
| "loss": 0.0794, | |
| "reward": 0.8333333469927311, | |
| "reward_std": 0.23116153106093407, | |
| "rewards/accuracy_reward": 0.8333333469927311, | |
| "rewards/format_reward": 0.0, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 663.6250076293945, | |
| "epoch": 2.8421052631578947, | |
| "grad_norm": 0.1792384833097458, | |
| "kl": 1.4888973236083984, | |
| "learning_rate": 1.729751350783293e-08, | |
| "loss": 0.1661, | |
| "reward": 0.3958333432674408, | |
| "reward_std": 0.28219256550073624, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 563.1250152587891, | |
| "epoch": 2.8842105263157896, | |
| "grad_norm": 1.4438295364379883, | |
| "kl": 1.0477867126464844, | |
| "learning_rate": 7.696014912157268e-09, | |
| "loss": 0.1218, | |
| "reward": 0.7291666865348816, | |
| "reward_std": 0.27258946746587753, | |
| "rewards/accuracy_reward": 0.7291666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 682.6875190734863, | |
| "epoch": 2.9263157894736844, | |
| "grad_norm": 1.317630410194397, | |
| "kl": 2.032796859741211, | |
| "learning_rate": 1.9252392434208623e-09, | |
| "loss": 0.0906, | |
| "reward": 0.3333333358168602, | |
| "reward_std": 0.06454972922801971, | |
| "rewards/accuracy_reward": 0.3333333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 544.9375, | |
| "epoch": 2.968421052631579, | |
| "grad_norm": 0.5226422548294067, | |
| "kl": 1.5402240753173828, | |
| "learning_rate": 0.0, | |
| "loss": 0.1601, | |
| "reward": 0.6875000149011612, | |
| "reward_std": 0.1530931107699871, | |
| "rewards/accuracy_reward": 0.6875000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 2.968421052631579, | |
| "step": 69, | |
| "total_flos": 0.0, | |
| "train_loss": 0.0823538613377436, | |
| "train_runtime": 2262.0879, | |
| "train_samples_per_second": 0.252, | |
| "train_steps_per_second": 0.031 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 69, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |