| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 10, | |
| "global_step": 938, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 121.971875, | |
| "epoch": 0.010666666666666666, | |
| "grad_norm": 0.156667098402977, | |
| "kl": 2.0313262939453126e-05, | |
| "learning_rate": 1.0638297872340426e-05, | |
| "loss": 0.001, | |
| "reward": 0.0125, | |
| "reward_std": 0.025, | |
| "rewards/accuracy_reward": 0.009375, | |
| "rewards/format_reward": 0.003125, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 122.521875, | |
| "epoch": 0.021333333333333333, | |
| "grad_norm": 0.0012713409960269928, | |
| "kl": 0.00021836161613464355, | |
| "learning_rate": 2.1276595744680852e-05, | |
| "loss": 0.0051, | |
| "reward": 0.015625, | |
| "reward_std": 0.025966878235340118, | |
| "rewards/accuracy_reward": 0.015625, | |
| "rewards/format_reward": 0.0, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 117.5125, | |
| "epoch": 0.032, | |
| "grad_norm": 0.002654253738000989, | |
| "kl": 0.0003068089485168457, | |
| "learning_rate": 3.1914893617021275e-05, | |
| "loss": -0.0002, | |
| "reward": 0.00625, | |
| "reward_std": 0.007216878235340118, | |
| "rewards/accuracy_reward": 0.00625, | |
| "rewards/format_reward": 0.0, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 118.871875, | |
| "epoch": 0.042666666666666665, | |
| "grad_norm": 0.00353299081325531, | |
| "kl": 0.000412750244140625, | |
| "learning_rate": 4.2553191489361704e-05, | |
| "loss": 0.0055, | |
| "reward": 0.009375, | |
| "reward_std": 0.01875, | |
| "rewards/accuracy_reward": 0.00625, | |
| "rewards/format_reward": 0.003125, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 121.046875, | |
| "epoch": 0.05333333333333334, | |
| "grad_norm": 0.003619612194597721, | |
| "kl": 0.0004070043563842773, | |
| "learning_rate": 5.319148936170213e-05, | |
| "loss": 0.0, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 119.4375, | |
| "epoch": 0.064, | |
| "grad_norm": 0.11174867302179337, | |
| "kl": 0.00045168399810791016, | |
| "learning_rate": 6.382978723404255e-05, | |
| "loss": 0.0064, | |
| "reward": 0.01875, | |
| "reward_std": 0.0375, | |
| "rewards/accuracy_reward": 0.015625, | |
| "rewards/format_reward": 0.003125, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 119.05625, | |
| "epoch": 0.07466666666666667, | |
| "grad_norm": 0.006828859448432922, | |
| "kl": 0.0011888980865478516, | |
| "learning_rate": 7.446808510638297e-05, | |
| "loss": 0.0012, | |
| "reward": 0.0125, | |
| "reward_std": 0.025, | |
| "rewards/accuracy_reward": 0.0125, | |
| "rewards/format_reward": 0.0, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 120.28125, | |
| "epoch": 0.08533333333333333, | |
| "grad_norm": 0.0064537739381194115, | |
| "kl": 0.0019659996032714844, | |
| "learning_rate": 8.510638297872341e-05, | |
| "loss": 0.0028, | |
| "reward": 0.0125, | |
| "reward_std": 0.025, | |
| "rewards/accuracy_reward": 0.0125, | |
| "rewards/format_reward": 0.0, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 117.559375, | |
| "epoch": 0.096, | |
| "grad_norm": 0.09068689495325089, | |
| "kl": 0.0025023460388183595, | |
| "learning_rate": 9.574468085106382e-05, | |
| "loss": 0.003, | |
| "reward": 0.021875, | |
| "reward_std": 0.04375, | |
| "rewards/accuracy_reward": 0.01875, | |
| "rewards/format_reward": 0.003125, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 117.68125, | |
| "epoch": 0.10666666666666667, | |
| "grad_norm": 0.16541939973831177, | |
| "kl": 0.00291900634765625, | |
| "learning_rate": 0.00010638297872340425, | |
| "loss": 0.0008, | |
| "reward": 0.021875, | |
| "reward_std": 0.03125, | |
| "rewards/accuracy_reward": 0.021875, | |
| "rewards/format_reward": 0.0, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 116.971875, | |
| "epoch": 0.11733333333333333, | |
| "grad_norm": 0.07206544280052185, | |
| "kl": 0.0038990020751953126, | |
| "learning_rate": 0.00011702127659574467, | |
| "loss": 0.0026, | |
| "reward": 0.015625, | |
| "reward_std": 0.03125, | |
| "rewards/accuracy_reward": 0.0125, | |
| "rewards/format_reward": 0.003125, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 114.996875, | |
| "epoch": 0.128, | |
| "grad_norm": 0.02286006510257721, | |
| "kl": 0.007346725463867188, | |
| "learning_rate": 0.0001276595744680851, | |
| "loss": 0.0076, | |
| "reward": 0.025, | |
| "reward_std": 0.05, | |
| "rewards/accuracy_reward": 0.015625, | |
| "rewards/format_reward": 0.009375, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 119.315625, | |
| "epoch": 0.13866666666666666, | |
| "grad_norm": 0.015629781410098076, | |
| "kl": 0.008090972900390625, | |
| "learning_rate": 0.00013829787234042552, | |
| "loss": 0.0011, | |
| "reward": 0.009375, | |
| "reward_std": 0.01875, | |
| "rewards/accuracy_reward": 0.009375, | |
| "rewards/format_reward": 0.0, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 121.821875, | |
| "epoch": 0.14933333333333335, | |
| "grad_norm": 0.15498439967632294, | |
| "kl": 0.006272506713867187, | |
| "learning_rate": 0.00014893617021276593, | |
| "loss": -0.0012, | |
| "reward": 0.021875, | |
| "reward_std": 0.03846687823534012, | |
| "rewards/accuracy_reward": 0.01875, | |
| "rewards/format_reward": 0.003125, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 121.409375, | |
| "epoch": 0.16, | |
| "grad_norm": 0.18756870925426483, | |
| "kl": 0.00465240478515625, | |
| "learning_rate": 0.00015957446808510637, | |
| "loss": 0.0012, | |
| "reward": 0.021875, | |
| "reward_std": 0.03846687823534012, | |
| "rewards/accuracy_reward": 0.01875, | |
| "rewards/format_reward": 0.003125, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 118.26875, | |
| "epoch": 0.17066666666666666, | |
| "grad_norm": 0.011626984924077988, | |
| "kl": 0.01092681884765625, | |
| "learning_rate": 0.00017021276595744682, | |
| "loss": -0.0011, | |
| "reward": 0.021875, | |
| "reward_std": 0.03318375647068024, | |
| "rewards/accuracy_reward": 0.01875, | |
| "rewards/format_reward": 0.003125, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 119.25, | |
| "epoch": 0.18133333333333335, | |
| "grad_norm": 0.00764912273734808, | |
| "kl": 0.00976104736328125, | |
| "learning_rate": 0.0001808510638297872, | |
| "loss": 0.0045, | |
| "reward": 0.021875, | |
| "reward_std": 0.04375, | |
| "rewards/accuracy_reward": 0.021875, | |
| "rewards/format_reward": 0.0, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 115.35, | |
| "epoch": 0.192, | |
| "grad_norm": 0.0785018652677536, | |
| "kl": 0.014077377319335938, | |
| "learning_rate": 0.00019148936170212765, | |
| "loss": 0.0037, | |
| "reward": 0.025, | |
| "reward_std": 0.04471687823534012, | |
| "rewards/accuracy_reward": 0.009375, | |
| "rewards/format_reward": 0.015625, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 108.6125, | |
| "epoch": 0.20266666666666666, | |
| "grad_norm": 0.13107918202877045, | |
| "kl": 0.039361572265625, | |
| "learning_rate": 0.00020212765957446807, | |
| "loss": 0.0412, | |
| "reward": 0.11875, | |
| "reward_std": 0.18907372057437896, | |
| "rewards/accuracy_reward": 0.021875, | |
| "rewards/format_reward": 0.096875, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 89.915625, | |
| "epoch": 0.21333333333333335, | |
| "grad_norm": 0.19012346863746643, | |
| "kl": 0.08895263671875, | |
| "learning_rate": 0.0002127659574468085, | |
| "loss": 0.1321, | |
| "reward": 0.46875, | |
| "reward_std": 0.41404569447040557, | |
| "rewards/accuracy_reward": 0.015625, | |
| "rewards/format_reward": 0.453125, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 47.74375, | |
| "epoch": 0.224, | |
| "grad_norm": 0.4668453335762024, | |
| "kl": 0.26416015625, | |
| "learning_rate": 0.0002234042553191489, | |
| "loss": 0.0712, | |
| "reward": 0.871875, | |
| "reward_std": 0.19805223047733306, | |
| "rewards/accuracy_reward": 0.01875, | |
| "rewards/format_reward": 0.853125, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 45.15625, | |
| "epoch": 0.23466666666666666, | |
| "grad_norm": 0.21052278578281403, | |
| "kl": 0.3112213134765625, | |
| "learning_rate": 0.00023404255319148934, | |
| "loss": 0.0464, | |
| "reward": 0.890625, | |
| "reward_std": 0.11346687823534012, | |
| "rewards/accuracy_reward": 0.0125, | |
| "rewards/format_reward": 0.878125, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 57.2875, | |
| "epoch": 0.24533333333333332, | |
| "grad_norm": 0.16618619859218597, | |
| "kl": 0.254522705078125, | |
| "learning_rate": 0.00024468085106382976, | |
| "loss": 0.0589, | |
| "reward": 0.834375, | |
| "reward_std": 0.12261751294136047, | |
| "rewards/accuracy_reward": 0.0125, | |
| "rewards/format_reward": 0.821875, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 68.153125, | |
| "epoch": 0.256, | |
| "grad_norm": 0.17739807069301605, | |
| "kl": 0.214471435546875, | |
| "learning_rate": 0.0002553191489361702, | |
| "loss": 0.1375, | |
| "reward": 0.659375, | |
| "reward_std": 0.28527562469244006, | |
| "rewards/accuracy_reward": 0.009375, | |
| "rewards/format_reward": 0.65, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 52.709375, | |
| "epoch": 0.26666666666666666, | |
| "grad_norm": 0.09843996912240982, | |
| "kl": 0.2847900390625, | |
| "learning_rate": 0.0002659574468085106, | |
| "loss": 0.1085, | |
| "reward": 0.834375, | |
| "reward_std": 0.290549997985363, | |
| "rewards/accuracy_reward": 0.034375, | |
| "rewards/format_reward": 0.8, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 51.55625, | |
| "epoch": 0.2773333333333333, | |
| "grad_norm": 0.1133696436882019, | |
| "kl": 0.276953125, | |
| "learning_rate": 0.00027659574468085103, | |
| "loss": 0.0437, | |
| "reward": 0.903125, | |
| "reward_std": 0.1361730858683586, | |
| "rewards/accuracy_reward": 0.025, | |
| "rewards/format_reward": 0.878125, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 55.046875, | |
| "epoch": 0.288, | |
| "grad_norm": 0.14536090195178986, | |
| "kl": 0.2501953125, | |
| "learning_rate": 0.0002872340425531915, | |
| "loss": 0.0588, | |
| "reward": 0.878125, | |
| "reward_std": 0.13846687823534012, | |
| "rewards/accuracy_reward": 0.00625, | |
| "rewards/format_reward": 0.871875, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 55.853125, | |
| "epoch": 0.2986666666666667, | |
| "grad_norm": 0.1799221634864807, | |
| "kl": 0.3143310546875, | |
| "learning_rate": 0.00029787234042553186, | |
| "loss": 0.0609, | |
| "reward": 0.90625, | |
| "reward_std": 0.18080126941204072, | |
| "rewards/accuracy_reward": 0.021875, | |
| "rewards/format_reward": 0.884375, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 59.85, | |
| "epoch": 0.30933333333333335, | |
| "grad_norm": 0.10688479989767075, | |
| "kl": 0.20706787109375, | |
| "learning_rate": 0.0002999925930442553, | |
| "loss": 0.0522, | |
| "reward": 0.815625, | |
| "reward_std": 0.2959165498614311, | |
| "rewards/accuracy_reward": 0.065625, | |
| "rewards/format_reward": 0.75, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 64.625, | |
| "epoch": 0.32, | |
| "grad_norm": 0.03851361572742462, | |
| "kl": 0.201220703125, | |
| "learning_rate": 0.00029996250354024344, | |
| "loss": 0.0815, | |
| "reward": 0.8625, | |
| "reward_std": 0.21301814764738083, | |
| "rewards/accuracy_reward": 0.0125, | |
| "rewards/format_reward": 0.85, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 57.95, | |
| "epoch": 0.33066666666666666, | |
| "grad_norm": 0.23480646312236786, | |
| "kl": 0.221240234375, | |
| "learning_rate": 0.0002999092731927958, | |
| "loss": 0.0292, | |
| "reward": 0.921875, | |
| "reward_std": 0.15895397514104842, | |
| "rewards/accuracy_reward": 0.0375, | |
| "rewards/format_reward": 0.884375, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 64.196875, | |
| "epoch": 0.3413333333333333, | |
| "grad_norm": 0.1151675432920456, | |
| "kl": 0.20123291015625, | |
| "learning_rate": 0.0002998329102159332, | |
| "loss": 0.0491, | |
| "reward": 0.83125, | |
| "reward_std": 0.19258119761943818, | |
| "rewards/accuracy_reward": 0.01875, | |
| "rewards/format_reward": 0.8125, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 70.628125, | |
| "epoch": 0.352, | |
| "grad_norm": 0.1377689391374588, | |
| "kl": 0.1906005859375, | |
| "learning_rate": 0.0002997334263932927, | |
| "loss": 0.0841, | |
| "reward": 0.846875, | |
| "reward_std": 0.21890811175107955, | |
| "rewards/accuracy_reward": 0.015625, | |
| "rewards/format_reward": 0.83125, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 61.54375, | |
| "epoch": 0.3626666666666667, | |
| "grad_norm": 0.0947548896074295, | |
| "kl": 0.21240234375, | |
| "learning_rate": 0.0002996108370763087, | |
| "loss": 0.062, | |
| "reward": 0.88125, | |
| "reward_std": 0.13713996410369872, | |
| "rewards/accuracy_reward": 0.015625, | |
| "rewards/format_reward": 0.865625, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 60.109375, | |
| "epoch": 0.37333333333333335, | |
| "grad_norm": 0.14599719643592834, | |
| "kl": 0.2236083984375, | |
| "learning_rate": 0.0002994651611818448, | |
| "loss": 0.0408, | |
| "reward": 0.928125, | |
| "reward_std": 0.18282372057437896, | |
| "rewards/accuracy_reward": 0.028125, | |
| "rewards/format_reward": 0.9, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 62.90625, | |
| "epoch": 0.384, | |
| "grad_norm": 0.3738599121570587, | |
| "kl": 0.2464111328125, | |
| "learning_rate": 0.00029929642118927394, | |
| "loss": 0.0753, | |
| "reward": 0.834375, | |
| "reward_std": 0.20676814764738083, | |
| "rewards/accuracy_reward": 0.025, | |
| "rewards/format_reward": 0.809375, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 70.1625, | |
| "epoch": 0.39466666666666667, | |
| "grad_norm": 2.8762810230255127, | |
| "kl": 0.88681640625, | |
| "learning_rate": 0.00029910464313701013, | |
| "loss": 0.2053, | |
| "reward": 0.640625, | |
| "reward_std": 0.38192625939846037, | |
| "rewards/accuracy_reward": 0.009375, | |
| "rewards/format_reward": 0.63125, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 43.578125, | |
| "epoch": 0.4053333333333333, | |
| "grad_norm": 0.9355350136756897, | |
| "kl": 2.06865234375, | |
| "learning_rate": 0.0002988898566184902, | |
| "loss": 0.2631, | |
| "reward": 0.725, | |
| "reward_std": 0.32462068647146225, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.725, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 44.19375, | |
| "epoch": 0.416, | |
| "grad_norm": 0.709173858165741, | |
| "kl": 3.47939453125, | |
| "learning_rate": 0.0002986520947776074, | |
| "loss": 0.3225, | |
| "reward": 0.6125, | |
| "reward_std": 0.3950331017374992, | |
| "rewards/accuracy_reward": 0.009375, | |
| "rewards/format_reward": 0.603125, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 54.775, | |
| "epoch": 0.4266666666666667, | |
| "grad_norm": 0.6549698114395142, | |
| "kl": 4.3202392578125, | |
| "learning_rate": 0.0002983913943035968, | |
| "loss": 0.3808, | |
| "reward": 0.66875, | |
| "reward_std": 0.3901100158691406, | |
| "rewards/accuracy_reward": 0.01875, | |
| "rewards/format_reward": 0.65, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 53.871875, | |
| "epoch": 0.43733333333333335, | |
| "grad_norm": 0.01826515607535839, | |
| "kl": 2.477734375, | |
| "learning_rate": 0.00029810779542537355, | |
| "loss": 0.2661, | |
| "reward": 0.79375, | |
| "reward_std": 0.22999776750802994, | |
| "rewards/accuracy_reward": 0.025, | |
| "rewards/format_reward": 0.76875, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 49.434375, | |
| "epoch": 0.448, | |
| "grad_norm": 0.5134692192077637, | |
| "kl": 2.07587890625, | |
| "learning_rate": 0.0002978013419053255, | |
| "loss": 0.2091, | |
| "reward": 0.771875, | |
| "reward_std": 0.26785253882408144, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.771875, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 59.475, | |
| "epoch": 0.45866666666666667, | |
| "grad_norm": 0.7835673689842224, | |
| "kl": 2.516943359375, | |
| "learning_rate": 0.00029747208103256, | |
| "loss": 0.2312, | |
| "reward": 0.740625, | |
| "reward_std": 0.31220938116312025, | |
| "rewards/accuracy_reward": 0.0125, | |
| "rewards/format_reward": 0.728125, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 58.815625, | |
| "epoch": 0.4693333333333333, | |
| "grad_norm": 0.021143430843949318, | |
| "kl": 2.1959716796875, | |
| "learning_rate": 0.0002971200636156068, | |
| "loss": 0.2386, | |
| "reward": 0.796875, | |
| "reward_std": 0.2231356605887413, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.796875, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 55.696875, | |
| "epoch": 0.48, | |
| "grad_norm": 1.8231980800628662, | |
| "kl": 2.9314697265625, | |
| "learning_rate": 0.00029674534397457745, | |
| "loss": 0.3506, | |
| "reward": 0.796875, | |
| "reward_std": 0.2616912335157394, | |
| "rewards/accuracy_reward": 0.021875, | |
| "rewards/format_reward": 0.775, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 49.778125, | |
| "epoch": 0.49066666666666664, | |
| "grad_norm": 0.5252532362937927, | |
| "kl": 1.47607421875, | |
| "learning_rate": 0.00029634797993278333, | |
| "loss": 0.2026, | |
| "reward": 0.89375, | |
| "reward_std": 0.11293471753597259, | |
| "rewards/accuracy_reward": 0.0125, | |
| "rewards/format_reward": 0.88125, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 57.903125, | |
| "epoch": 0.5013333333333333, | |
| "grad_norm": 0.10718824714422226, | |
| "kl": 1.683837890625, | |
| "learning_rate": 0.000295928032807813, | |
| "loss": 0.1887, | |
| "reward": 0.859375, | |
| "reward_std": 0.1423343911767006, | |
| "rewards/accuracy_reward": 0.00625, | |
| "rewards/format_reward": 0.853125, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 63.734375, | |
| "epoch": 0.512, | |
| "grad_norm": 0.32101932168006897, | |
| "kl": 2.9671875, | |
| "learning_rate": 0.00029548556740206994, | |
| "loss": 0.3254, | |
| "reward": 0.79375, | |
| "reward_std": 0.29874250292778015, | |
| "rewards/accuracy_reward": 0.009375, | |
| "rewards/format_reward": 0.784375, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 66.253125, | |
| "epoch": 0.5226666666666666, | |
| "grad_norm": 0.7132259011268616, | |
| "kl": 2.6101806640625, | |
| "learning_rate": 0.0002950206519927731, | |
| "loss": 0.2574, | |
| "reward": 0.728125, | |
| "reward_std": 0.3086773693561554, | |
| "rewards/accuracy_reward": 0.01875, | |
| "rewards/format_reward": 0.709375, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 64.765625, | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 0.7293491959571838, | |
| "kl": 3.3251220703125, | |
| "learning_rate": 0.00029453335832142075, | |
| "loss": 0.3315, | |
| "reward": 0.75625, | |
| "reward_std": 0.27030970752239225, | |
| "rewards/accuracy_reward": 0.009375, | |
| "rewards/format_reward": 0.746875, | |
| "step": 500 | |
| }, | |
| { | |
| "completion_length": 61.140625, | |
| "epoch": 0.544, | |
| "grad_norm": 0.4900813400745392, | |
| "kl": 1.6069091796875, | |
| "learning_rate": 0.0002940237615827202, | |
| "loss": 0.162, | |
| "reward": 0.86875, | |
| "reward_std": 0.21899680644273758, | |
| "rewards/accuracy_reward": 0.053125, | |
| "rewards/format_reward": 0.815625, | |
| "step": 510 | |
| }, | |
| { | |
| "completion_length": 59.621875, | |
| "epoch": 0.5546666666666666, | |
| "grad_norm": 0.4984245002269745, | |
| "kl": 1.695751953125, | |
| "learning_rate": 0.00029349194041298435, | |
| "loss": 0.2075, | |
| "reward": 0.903125, | |
| "reward_std": 0.16081304997205734, | |
| "rewards/accuracy_reward": 0.034375, | |
| "rewards/format_reward": 0.86875, | |
| "step": 520 | |
| }, | |
| { | |
| "completion_length": 62.209375, | |
| "epoch": 0.5653333333333334, | |
| "grad_norm": 0.25215986371040344, | |
| "kl": 1.5575439453125, | |
| "learning_rate": 0.0002929379768779971, | |
| "loss": 0.1648, | |
| "reward": 0.890625, | |
| "reward_std": 0.17983439117670058, | |
| "rewards/accuracy_reward": 0.040625, | |
| "rewards/format_reward": 0.85, | |
| "step": 530 | |
| }, | |
| { | |
| "completion_length": 65.865625, | |
| "epoch": 0.576, | |
| "grad_norm": 0.1489488184452057, | |
| "kl": 2.0063720703125, | |
| "learning_rate": 0.0002923619564603501, | |
| "loss": 0.187, | |
| "reward": 0.78125, | |
| "reward_std": 0.2043856605887413, | |
| "rewards/accuracy_reward": 0.03125, | |
| "rewards/format_reward": 0.75, | |
| "step": 540 | |
| }, | |
| { | |
| "completion_length": 68.68125, | |
| "epoch": 0.5866666666666667, | |
| "grad_norm": 0.24991311132907867, | |
| "kl": 1.0421142578125, | |
| "learning_rate": 0.00029176396804625135, | |
| "loss": 0.0977, | |
| "reward": 0.909375, | |
| "reward_std": 0.1441847175359726, | |
| "rewards/accuracy_reward": 0.021875, | |
| "rewards/format_reward": 0.8875, | |
| "step": 550 | |
| }, | |
| { | |
| "completion_length": 64.2, | |
| "epoch": 0.5973333333333334, | |
| "grad_norm": 0.7193971872329712, | |
| "kl": 2.2302490234375, | |
| "learning_rate": 0.00029114410391180946, | |
| "loss": 0.2166, | |
| "reward": 0.834375, | |
| "reward_std": 0.2048343911767006, | |
| "rewards/accuracy_reward": 0.015625, | |
| "rewards/format_reward": 0.81875, | |
| "step": 560 | |
| }, | |
| { | |
| "completion_length": 66.01875, | |
| "epoch": 0.608, | |
| "grad_norm": 0.59996098279953, | |
| "kl": 2.652294921875, | |
| "learning_rate": 0.0002905024597087945, | |
| "loss": 0.2907, | |
| "reward": 0.815625, | |
| "reward_std": 0.20596464574337006, | |
| "rewards/accuracy_reward": 0.028125, | |
| "rewards/format_reward": 0.7875, | |
| "step": 570 | |
| }, | |
| { | |
| "completion_length": 69.009375, | |
| "epoch": 0.6186666666666667, | |
| "grad_norm": 0.32363754510879517, | |
| "kl": 0.75888671875, | |
| "learning_rate": 0.0002898391344498775, | |
| "loss": 0.112, | |
| "reward": 0.896875, | |
| "reward_std": 0.14761751294136047, | |
| "rewards/accuracy_reward": 0.034375, | |
| "rewards/format_reward": 0.8625, | |
| "step": 580 | |
| }, | |
| { | |
| "completion_length": 58.490625, | |
| "epoch": 0.6293333333333333, | |
| "grad_norm": 0.5817243456840515, | |
| "kl": 3.2273193359375, | |
| "learning_rate": 0.0002891542304933521, | |
| "loss": 0.3775, | |
| "reward": 0.796875, | |
| "reward_std": 0.24620190411806106, | |
| "rewards/accuracy_reward": 0.00625, | |
| "rewards/format_reward": 0.790625, | |
| "step": 590 | |
| }, | |
| { | |
| "completion_length": 56.890625, | |
| "epoch": 0.64, | |
| "grad_norm": 0.561817467212677, | |
| "kl": 1.345703125, | |
| "learning_rate": 0.00028844785352733924, | |
| "loss": 0.1409, | |
| "reward": 0.884375, | |
| "reward_std": 0.1315855011343956, | |
| "rewards/accuracy_reward": 0.021875, | |
| "rewards/format_reward": 0.8625, | |
| "step": 600 | |
| }, | |
| { | |
| "completion_length": 54.61875, | |
| "epoch": 0.6506666666666666, | |
| "grad_norm": 0.41451311111450195, | |
| "kl": 1.3966552734375, | |
| "learning_rate": 0.00028772011255347873, | |
| "loss": 0.1476, | |
| "reward": 0.890625, | |
| "reward_std": 0.16838996410369872, | |
| "rewards/accuracy_reward": 0.01875, | |
| "rewards/format_reward": 0.871875, | |
| "step": 610 | |
| }, | |
| { | |
| "completion_length": 58.15, | |
| "epoch": 0.6613333333333333, | |
| "grad_norm": 0.38927924633026123, | |
| "kl": 2.2388427734375, | |
| "learning_rate": 0.00028697111987010865, | |
| "loss": 0.2576, | |
| "reward": 0.871875, | |
| "reward_std": 0.1995512694120407, | |
| "rewards/accuracy_reward": 0.028125, | |
| "rewards/format_reward": 0.84375, | |
| "step": 620 | |
| }, | |
| { | |
| "completion_length": 57.20625, | |
| "epoch": 0.672, | |
| "grad_norm": 0.09751415997743607, | |
| "kl": 0.793994140625, | |
| "learning_rate": 0.0002862009910549369, | |
| "loss": 0.0629, | |
| "reward": 0.9375, | |
| "reward_std": 0.10386751294136047, | |
| "rewards/accuracy_reward": 0.01875, | |
| "rewards/format_reward": 0.91875, | |
| "step": 630 | |
| }, | |
| { | |
| "completion_length": 65.846875, | |
| "epoch": 0.6826666666666666, | |
| "grad_norm": 0.2675510048866272, | |
| "kl": 2.466015625, | |
| "learning_rate": 0.0002854098449472061, | |
| "loss": 0.2627, | |
| "reward": 0.79375, | |
| "reward_std": 0.2520918682217598, | |
| "rewards/accuracy_reward": 0.015625, | |
| "rewards/format_reward": 0.778125, | |
| "step": 640 | |
| }, | |
| { | |
| "completion_length": 62.98125, | |
| "epoch": 0.6933333333333334, | |
| "grad_norm": 0.15855202078819275, | |
| "kl": 1.8398193359375, | |
| "learning_rate": 0.00028459780362935527, | |
| "loss": 0.177, | |
| "reward": 0.91875, | |
| "reward_std": 0.15879059880971907, | |
| "rewards/accuracy_reward": 0.0125, | |
| "rewards/format_reward": 0.90625, | |
| "step": 650 | |
| }, | |
| { | |
| "completion_length": 62.6, | |
| "epoch": 0.704, | |
| "grad_norm": 0.12087615579366684, | |
| "kl": 2.0813720703125, | |
| "learning_rate": 0.0002837649924081816, | |
| "loss": 0.1866, | |
| "reward": 0.90625, | |
| "reward_std": 0.19479155987501146, | |
| "rewards/accuracy_reward": 0.03125, | |
| "rewards/format_reward": 0.875, | |
| "step": 660 | |
| }, | |
| { | |
| "completion_length": 65.925, | |
| "epoch": 0.7146666666666667, | |
| "grad_norm": 0.39411771297454834, | |
| "kl": 1.392919921875, | |
| "learning_rate": 0.00028291153979550387, | |
| "loss": 0.2015, | |
| "reward": 0.915625, | |
| "reward_std": 0.1775405988097191, | |
| "rewards/accuracy_reward": 0.028125, | |
| "rewards/format_reward": 0.8875, | |
| "step": 670 | |
| }, | |
| { | |
| "completion_length": 64.378125, | |
| "epoch": 0.7253333333333334, | |
| "grad_norm": 1.1659783124923706, | |
| "kl": 2.8165283203125, | |
| "learning_rate": 0.00028203757748833174, | |
| "loss": 0.3109, | |
| "reward": 0.778125, | |
| "reward_std": 0.21169123351573943, | |
| "rewards/accuracy_reward": 0.003125, | |
| "rewards/format_reward": 0.775, | |
| "step": 680 | |
| }, | |
| { | |
| "completion_length": 57.334375, | |
| "epoch": 0.736, | |
| "grad_norm": 0.27627384662628174, | |
| "kl": 1.0085693359375, | |
| "learning_rate": 0.0002811432403485437, | |
| "loss": 0.1226, | |
| "reward": 0.859375, | |
| "reward_std": 0.11540063470602036, | |
| "rewards/accuracy_reward": 0.009375, | |
| "rewards/format_reward": 0.85, | |
| "step": 690 | |
| }, | |
| { | |
| "completion_length": 54.6, | |
| "epoch": 0.7466666666666667, | |
| "grad_norm": 0.4506663382053375, | |
| "kl": 2.3274169921875, | |
| "learning_rate": 0.00028022866638207624, | |
| "loss": 0.2726, | |
| "reward": 0.853125, | |
| "reward_std": 0.2143363133072853, | |
| "rewards/accuracy_reward": 0.025, | |
| "rewards/format_reward": 0.828125, | |
| "step": 700 | |
| }, | |
| { | |
| "completion_length": 64.646875, | |
| "epoch": 0.7573333333333333, | |
| "grad_norm": 0.24161870777606964, | |
| "kl": 1.10751953125, | |
| "learning_rate": 0.00027929399671762793, | |
| "loss": 0.1497, | |
| "reward": 0.878125, | |
| "reward_std": 0.18096464574337007, | |
| "rewards/accuracy_reward": 0.04375, | |
| "rewards/format_reward": 0.834375, | |
| "step": 710 | |
| }, | |
| { | |
| "completion_length": 65.128125, | |
| "epoch": 0.768, | |
| "grad_norm": 0.22652657330036163, | |
| "kl": 1.7567138671875, | |
| "learning_rate": 0.00027833937558488183, | |
| "loss": 0.1692, | |
| "reward": 0.865625, | |
| "reward_std": 0.19575843811035157, | |
| "rewards/accuracy_reward": 0.059375, | |
| "rewards/format_reward": 0.80625, | |
| "step": 720 | |
| }, | |
| { | |
| "completion_length": 76.259375, | |
| "epoch": 0.7786666666666666, | |
| "grad_norm": 0.46417316794395447, | |
| "kl": 3.4798828125, | |
| "learning_rate": 0.0002773649502922495, | |
| "loss": 0.3618, | |
| "reward": 0.7125, | |
| "reward_std": 0.31879488229751585, | |
| "rewards/accuracy_reward": 0.0125, | |
| "rewards/format_reward": 0.7, | |
| "step": 730 | |
| }, | |
| { | |
| "completion_length": 64.928125, | |
| "epoch": 0.7893333333333333, | |
| "grad_norm": 0.8743041753768921, | |
| "kl": 2.166650390625, | |
| "learning_rate": 0.00027637087120413933, | |
| "loss": 0.2562, | |
| "reward": 0.840625, | |
| "reward_std": 0.2851921945810318, | |
| "rewards/accuracy_reward": 0.0375, | |
| "rewards/format_reward": 0.803125, | |
| "step": 740 | |
| }, | |
| { | |
| "completion_length": 58.028125, | |
| "epoch": 0.8, | |
| "grad_norm": 0.18655003607273102, | |
| "kl": 1.73994140625, | |
| "learning_rate": 0.000275357291717754, | |
| "loss": 0.191, | |
| "reward": 0.909375, | |
| "reward_std": 0.19460364878177644, | |
| "rewards/accuracy_reward": 0.053125, | |
| "rewards/format_reward": 0.85625, | |
| "step": 750 | |
| }, | |
| { | |
| "completion_length": 60.803125, | |
| "epoch": 0.8106666666666666, | |
| "grad_norm": 0.04459076747298241, | |
| "kl": 1.7782470703125, | |
| "learning_rate": 0.0002743243682394195, | |
| "loss": 0.2117, | |
| "reward": 0.83125, | |
| "reward_std": 0.16213996410369874, | |
| "rewards/accuracy_reward": 0.025, | |
| "rewards/format_reward": 0.80625, | |
| "step": 760 | |
| }, | |
| { | |
| "completion_length": 56.203125, | |
| "epoch": 0.8213333333333334, | |
| "grad_norm": 0.04220689460635185, | |
| "kl": 1.7406494140625, | |
| "learning_rate": 0.00027327226016044963, | |
| "loss": 0.1999, | |
| "reward": 0.878125, | |
| "reward_std": 0.1423343911767006, | |
| "rewards/accuracy_reward": 0.034375, | |
| "rewards/format_reward": 0.84375, | |
| "step": 770 | |
| }, | |
| { | |
| "completion_length": 58.375, | |
| "epoch": 0.832, | |
| "grad_norm": 0.3807085156440735, | |
| "kl": 1.8222412109375, | |
| "learning_rate": 0.00027220112983255087, | |
| "loss": 0.2296, | |
| "reward": 0.903125, | |
| "reward_std": 0.20482564270496367, | |
| "rewards/accuracy_reward": 0.034375, | |
| "rewards/format_reward": 0.86875, | |
| "step": 780 | |
| }, | |
| { | |
| "completion_length": 63.378125, | |
| "epoch": 0.8426666666666667, | |
| "grad_norm": 0.01206011138856411, | |
| "kl": 2.458740234375, | |
| "learning_rate": 0.00027111114254276913, | |
| "loss": 0.3096, | |
| "reward": 0.84375, | |
| "reward_std": 0.2114198923110962, | |
| "rewards/accuracy_reward": 0.021875, | |
| "rewards/format_reward": 0.821875, | |
| "step": 790 | |
| }, | |
| { | |
| "completion_length": 59.7625, | |
| "epoch": 0.8533333333333334, | |
| "grad_norm": 0.40591439604759216, | |
| "kl": 1.378076171875, | |
| "learning_rate": 0.00027000246648798456, | |
| "loss": 0.1403, | |
| "reward": 0.934375, | |
| "reward_std": 0.14083535224199295, | |
| "rewards/accuracy_reward": 0.03125, | |
| "rewards/format_reward": 0.903125, | |
| "step": 800 | |
| }, | |
| { | |
| "completion_length": 62.284375, | |
| "epoch": 0.864, | |
| "grad_norm": 0.27511999011039734, | |
| "kl": 2.2107177734375, | |
| "learning_rate": 0.0002688752727489565, | |
| "loss": 0.2636, | |
| "reward": 0.8875, | |
| "reward_std": 0.21739855110645295, | |
| "rewards/accuracy_reward": 0.028125, | |
| "rewards/format_reward": 0.859375, | |
| "step": 810 | |
| }, | |
| { | |
| "completion_length": 65.265625, | |
| "epoch": 0.8746666666666667, | |
| "grad_norm": 0.2582601010799408, | |
| "kl": 2.3897705078125, | |
| "learning_rate": 0.00026772973526392453, | |
| "loss": 0.2965, | |
| "reward": 0.83125, | |
| "reward_std": 0.2494538262486458, | |
| "rewards/accuracy_reward": 0.028125, | |
| "rewards/format_reward": 0.803125, | |
| "step": 820 | |
| }, | |
| { | |
| "completion_length": 54.865625, | |
| "epoch": 0.8853333333333333, | |
| "grad_norm": 0.23494267463684082, | |
| "kl": 2.6015625, | |
| "learning_rate": 0.0002665660308017671, | |
| "loss": 0.252, | |
| "reward": 0.9, | |
| "reward_std": 0.23950843811035155, | |
| "rewards/accuracy_reward": 0.04375, | |
| "rewards/format_reward": 0.85625, | |
| "step": 830 | |
| }, | |
| { | |
| "completion_length": 55.503125, | |
| "epoch": 0.896, | |
| "grad_norm": 0.20798054337501526, | |
| "kl": 1.5889892578125, | |
| "learning_rate": 0.000265384338934725, | |
| "loss": 0.1996, | |
| "reward": 0.9375, | |
| "reward_std": 0.20120493620634078, | |
| "rewards/accuracy_reward": 0.0625, | |
| "rewards/format_reward": 0.875, | |
| "step": 840 | |
| }, | |
| { | |
| "completion_length": 59.909375, | |
| "epoch": 0.9066666666666666, | |
| "grad_norm": 0.23807695508003235, | |
| "kl": 1.656982421875, | |
| "learning_rate": 0.00026418484201069055, | |
| "loss": 0.194, | |
| "reward": 0.840625, | |
| "reward_std": 0.17524680644273757, | |
| "rewards/accuracy_reward": 0.034375, | |
| "rewards/format_reward": 0.80625, | |
| "step": 850 | |
| }, | |
| { | |
| "completion_length": 56.74375, | |
| "epoch": 0.9173333333333333, | |
| "grad_norm": 0.21559438109397888, | |
| "kl": 0.813427734375, | |
| "learning_rate": 0.00026296772512507025, | |
| "loss": 0.1054, | |
| "reward": 0.884375, | |
| "reward_std": 0.13916241526603698, | |
| "rewards/accuracy_reward": 0.021875, | |
| "rewards/format_reward": 0.8625, | |
| "step": 860 | |
| }, | |
| { | |
| "completion_length": 62.390625, | |
| "epoch": 0.928, | |
| "grad_norm": 0.1291944831609726, | |
| "kl": 1.9663330078125, | |
| "learning_rate": 0.0002617331760922218, | |
| "loss": 0.2316, | |
| "reward": 0.85625, | |
| "reward_std": 0.15685684233903885, | |
| "rewards/accuracy_reward": 0.0125, | |
| "rewards/format_reward": 0.84375, | |
| "step": 870 | |
| }, | |
| { | |
| "completion_length": 56.009375, | |
| "epoch": 0.9386666666666666, | |
| "grad_norm": 1.045857548713684, | |
| "kl": 1.652001953125, | |
| "learning_rate": 0.0002604813854164726, | |
| "loss": 0.1616, | |
| "reward": 0.9375, | |
| "reward_std": 0.16336943507194518, | |
| "rewards/accuracy_reward": 0.028125, | |
| "rewards/format_reward": 0.909375, | |
| "step": 880 | |
| }, | |
| { | |
| "completion_length": 64.446875, | |
| "epoch": 0.9493333333333334, | |
| "grad_norm": 0.33091413974761963, | |
| "kl": 3.235400390625, | |
| "learning_rate": 0.0002592125462627231, | |
| "loss": 0.3973, | |
| "reward": 0.796875, | |
| "reward_std": 0.2716366216540337, | |
| "rewards/accuracy_reward": 0.015625, | |
| "rewards/format_reward": 0.78125, | |
| "step": 890 | |
| }, | |
| { | |
| "completion_length": 59.0, | |
| "epoch": 0.96, | |
| "grad_norm": 0.25974419713020325, | |
| "kl": 1.752197265625, | |
| "learning_rate": 0.00025792685442663877, | |
| "loss": 0.1938, | |
| "reward": 0.89375, | |
| "reward_std": 0.1826515957713127, | |
| "rewards/accuracy_reward": 0.028125, | |
| "rewards/format_reward": 0.865625, | |
| "step": 900 | |
| }, | |
| { | |
| "completion_length": 57.4125, | |
| "epoch": 0.9706666666666667, | |
| "grad_norm": 0.2569887936115265, | |
| "kl": 2.5720703125, | |
| "learning_rate": 0.00025662450830443733, | |
| "loss": 0.3213, | |
| "reward": 0.846875, | |
| "reward_std": 0.22065922170877456, | |
| "rewards/accuracy_reward": 0.03125, | |
| "rewards/format_reward": 0.815625, | |
| "step": 910 | |
| }, | |
| { | |
| "completion_length": 51.140625, | |
| "epoch": 0.9813333333333333, | |
| "grad_norm": 0.18798935413360596, | |
| "kl": 1.6124267578125, | |
| "learning_rate": 0.0002553057088622736, | |
| "loss": 0.2214, | |
| "reward": 0.925, | |
| "reward_std": 0.1477062076330185, | |
| "rewards/accuracy_reward": 0.015625, | |
| "rewards/format_reward": 0.909375, | |
| "step": 920 | |
| }, | |
| { | |
| "completion_length": 53.921875, | |
| "epoch": 0.992, | |
| "grad_norm": 0.8309330940246582, | |
| "kl": 1.651806640625, | |
| "learning_rate": 0.0002539706596052286, | |
| "loss": 0.1893, | |
| "reward": 0.909375, | |
| "reward_std": 0.16504059880971908, | |
| "rewards/accuracy_reward": 0.03125, | |
| "rewards/format_reward": 0.878125, | |
| "step": 930 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2811, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |