| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 10, | |
| "global_step": 288, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 1413.1667175292969, | |
| "epoch": 0.003472222222222222, | |
| "grad_norm": 9.813347816467285, | |
| "kl": 0.0, | |
| "learning_rate": 5.172413793103448e-08, | |
| "loss": 0.2647, | |
| "reward": 0.3333333358168602, | |
| "reward_std": 0.30354245007038116, | |
| "rewards/accuracy_reward": 0.3333333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 1227.2916870117188, | |
| "epoch": 0.006944444444444444, | |
| "grad_norm": 10.432541847229004, | |
| "kl": 0.0, | |
| "learning_rate": 1.0344827586206897e-07, | |
| "loss": 0.2764, | |
| "reward": 0.3750000074505806, | |
| "reward_std": 0.4230387955904007, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 1248.2292175292969, | |
| "epoch": 0.010416666666666666, | |
| "grad_norm": 6.082642555236816, | |
| "kl": 0.000865936279296875, | |
| "learning_rate": 1.5517241379310344e-07, | |
| "loss": 0.2414, | |
| "reward": 0.4375000149011612, | |
| "reward_std": 0.37377967685461044, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 1296.2292175292969, | |
| "epoch": 0.013888888888888888, | |
| "grad_norm": 3.262653350830078, | |
| "kl": 0.0013132095336914062, | |
| "learning_rate": 2.0689655172413793e-07, | |
| "loss": 0.0938, | |
| "reward": 0.2916666716337204, | |
| "reward_std": 0.415207602083683, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 1678.8333740234375, | |
| "epoch": 0.017361111111111112, | |
| "grad_norm": 3.2543540000915527, | |
| "kl": 0.00103759765625, | |
| "learning_rate": 2.5862068965517245e-07, | |
| "loss": 0.0921, | |
| "reward": 0.1041666679084301, | |
| "reward_std": 0.1705273985862732, | |
| "rewards/accuracy_reward": 0.1041666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 1484.7083435058594, | |
| "epoch": 0.020833333333333332, | |
| "grad_norm": 2.4255130290985107, | |
| "kl": 0.0007534027099609375, | |
| "learning_rate": 3.103448275862069e-07, | |
| "loss": 0.0765, | |
| "reward": 0.1875000037252903, | |
| "reward_std": 0.2350771278142929, | |
| "rewards/accuracy_reward": 0.1875000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 1453.5625610351562, | |
| "epoch": 0.024305555555555556, | |
| "grad_norm": 2.6694440841674805, | |
| "kl": 0.00121307373046875, | |
| "learning_rate": 3.6206896551724143e-07, | |
| "loss": 0.1199, | |
| "reward": 0.2083333395421505, | |
| "reward_std": 0.24859582632780075, | |
| "rewards/accuracy_reward": 0.2083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 1483.2917175292969, | |
| "epoch": 0.027777777777777776, | |
| "grad_norm": 4.727176666259766, | |
| "kl": 0.001918792724609375, | |
| "learning_rate": 4.1379310344827586e-07, | |
| "loss": 0.1706, | |
| "reward": 0.2083333358168602, | |
| "reward_std": 0.20148037374019623, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 1469.9791870117188, | |
| "epoch": 0.03125, | |
| "grad_norm": 7.599963188171387, | |
| "kl": 0.003757476806640625, | |
| "learning_rate": 4.6551724137931035e-07, | |
| "loss": 0.2781, | |
| "reward": 0.2916666679084301, | |
| "reward_std": 0.377695269882679, | |
| "rewards/accuracy_reward": 0.2916666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 1376.4584045410156, | |
| "epoch": 0.034722222222222224, | |
| "grad_norm": 6.380342483520508, | |
| "kl": 0.00640869140625, | |
| "learning_rate": 5.172413793103449e-07, | |
| "loss": 0.2153, | |
| "reward": 0.27083334885537624, | |
| "reward_std": 0.29962683469057083, | |
| "rewards/accuracy_reward": 0.27083334885537624, | |
| "rewards/format_reward": 0.0, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 1691.3750610351562, | |
| "epoch": 0.03819444444444445, | |
| "grad_norm": 3.349307060241699, | |
| "kl": 0.039398193359375, | |
| "learning_rate": 5.689655172413793e-07, | |
| "loss": 0.158, | |
| "reward": 0.1458333395421505, | |
| "reward_std": 0.21764283254742622, | |
| "rewards/accuracy_reward": 0.1458333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 1433.6250305175781, | |
| "epoch": 0.041666666666666664, | |
| "grad_norm": 3.862722158432007, | |
| "kl": 0.0404052734375, | |
| "learning_rate": 6.206896551724138e-07, | |
| "loss": 0.1546, | |
| "reward": 0.2500000111758709, | |
| "reward_std": 0.24859580025076866, | |
| "rewards/accuracy_reward": 0.2500000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 1470.3125305175781, | |
| "epoch": 0.04513888888888889, | |
| "grad_norm": 5.017617225646973, | |
| "kl": 0.06573486328125, | |
| "learning_rate": 6.724137931034483e-07, | |
| "loss": 0.3684, | |
| "reward": 0.3125000149011612, | |
| "reward_std": 0.38161083310842514, | |
| "rewards/accuracy_reward": 0.3125000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 1778.3750305175781, | |
| "epoch": 0.04861111111111111, | |
| "grad_norm": 23.194419860839844, | |
| "kl": 0.84765625, | |
| "learning_rate": 7.241379310344829e-07, | |
| "loss": 0.1976, | |
| "reward": 0.16666666977107525, | |
| "reward_std": 0.2957112528383732, | |
| "rewards/accuracy_reward": 0.16666666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 1817.6041870117188, | |
| "epoch": 0.052083333333333336, | |
| "grad_norm": 8.560534477233887, | |
| "kl": 0.27681541442871094, | |
| "learning_rate": 7.758620689655173e-07, | |
| "loss": 0.116, | |
| "reward": 0.1041666679084301, | |
| "reward_std": 0.21764282882213593, | |
| "rewards/accuracy_reward": 0.1041666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 1941.0833740234375, | |
| "epoch": 0.05555555555555555, | |
| "grad_norm": 40.363834381103516, | |
| "kl": 0.58837890625, | |
| "learning_rate": 8.275862068965517e-07, | |
| "loss": 0.0613, | |
| "reward": 0.02083333395421505, | |
| "reward_std": 0.05103103443980217, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 1548.2292175292969, | |
| "epoch": 0.059027777777777776, | |
| "grad_norm": 4.416133880615234, | |
| "kl": 0.3662109375, | |
| "learning_rate": 8.793103448275862e-07, | |
| "loss": 0.2711, | |
| "reward": 0.20833333767950535, | |
| "reward_std": 0.30354245752096176, | |
| "rewards/accuracy_reward": 0.20833333767950535, | |
| "rewards/format_reward": 0.0, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 1781.5625305175781, | |
| "epoch": 0.0625, | |
| "grad_norm": 12.145301818847656, | |
| "kl": 1.225830078125, | |
| "learning_rate": 9.310344827586207e-07, | |
| "loss": 0.0986, | |
| "reward": 0.1041666679084301, | |
| "reward_std": 0.13301505148410797, | |
| "rewards/accuracy_reward": 0.1041666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 1878.3958740234375, | |
| "epoch": 0.06597222222222222, | |
| "grad_norm": 7.587356090545654, | |
| "kl": 0.373046875, | |
| "learning_rate": 9.827586206896552e-07, | |
| "loss": 0.1127, | |
| "reward": 0.0833333358168602, | |
| "reward_std": 0.20412414520978928, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 1930.6875, | |
| "epoch": 0.06944444444444445, | |
| "grad_norm": 1.6869007349014282, | |
| "kl": 0.43115234375, | |
| "learning_rate": 1.0344827586206898e-06, | |
| "loss": 0.0786, | |
| "reward": 0.06250000186264515, | |
| "reward_std": 0.1530931033194065, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 1857.3125610351562, | |
| "epoch": 0.07291666666666667, | |
| "grad_norm": 1.8587008714675903, | |
| "kl": 0.55419921875, | |
| "learning_rate": 1.0862068965517241e-06, | |
| "loss": 0.1276, | |
| "reward": 0.0833333358168602, | |
| "reward_std": 0.16661180183291435, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 1884.0417175292969, | |
| "epoch": 0.0763888888888889, | |
| "grad_norm": 2.7210440635681152, | |
| "kl": 0.5166015625, | |
| "learning_rate": 1.1379310344827587e-06, | |
| "loss": 0.1005, | |
| "reward": 0.06250000186264515, | |
| "reward_std": 0.11558076739311218, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 1634.3333740234375, | |
| "epoch": 0.0798611111111111, | |
| "grad_norm": 10.677355766296387, | |
| "kl": 0.64892578125, | |
| "learning_rate": 1.1896551724137932e-06, | |
| "loss": 0.1699, | |
| "reward": 0.1250000037252903, | |
| "reward_std": 0.23116153106093407, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 1794.3750305175781, | |
| "epoch": 0.08333333333333333, | |
| "grad_norm": 3.3855395317077637, | |
| "kl": 0.61962890625, | |
| "learning_rate": 1.2413793103448275e-06, | |
| "loss": 0.1481, | |
| "reward": 0.12500000186264515, | |
| "reward_std": 0.18404608592391014, | |
| "rewards/accuracy_reward": 0.12500000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 1559.6042175292969, | |
| "epoch": 0.08680555555555555, | |
| "grad_norm": 954.7173461914062, | |
| "kl": 7.8359375, | |
| "learning_rate": 1.293103448275862e-06, | |
| "loss": 0.6252, | |
| "reward": 0.2291666679084301, | |
| "reward_std": 0.1801304891705513, | |
| "rewards/accuracy_reward": 0.2291666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 1605.0625610351562, | |
| "epoch": 0.09027777777777778, | |
| "grad_norm": 39.34937286376953, | |
| "kl": 0.4267578125, | |
| "learning_rate": 1.3448275862068966e-06, | |
| "loss": 0.2043, | |
| "reward": 0.1875000074505806, | |
| "reward_std": 0.24468021839857101, | |
| "rewards/accuracy_reward": 0.1875000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 1480.5, | |
| "epoch": 0.09375, | |
| "grad_norm": 14.309903144836426, | |
| "kl": 0.266357421875, | |
| "learning_rate": 1.396551724137931e-06, | |
| "loss": 0.1969, | |
| "reward": 0.3125000111758709, | |
| "reward_std": 0.2446802221238613, | |
| "rewards/accuracy_reward": 0.3125000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 1569.2083740234375, | |
| "epoch": 0.09722222222222222, | |
| "grad_norm": 28.465957641601562, | |
| "kl": 0.291748046875, | |
| "learning_rate": 1.4482758620689657e-06, | |
| "loss": 0.2874, | |
| "reward": 0.31250000186264515, | |
| "reward_std": 0.34674228355288506, | |
| "rewards/accuracy_reward": 0.31250000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 1323.6042022705078, | |
| "epoch": 0.10069444444444445, | |
| "grad_norm": 180.43153381347656, | |
| "kl": 0.9884033203125, | |
| "learning_rate": 1.5e-06, | |
| "loss": 0.3185, | |
| "reward": 0.3750000037252903, | |
| "reward_std": 0.26603010296821594, | |
| "rewards/accuracy_reward": 0.3750000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 1678.6875610351562, | |
| "epoch": 0.10416666666666667, | |
| "grad_norm": 3989.0673828125, | |
| "kl": 23.61474609375, | |
| "learning_rate": 1.5517241379310346e-06, | |
| "loss": 1.1653, | |
| "reward": 0.1666666679084301, | |
| "reward_std": 0.24859581142663956, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 1111.4167175292969, | |
| "epoch": 0.1076388888888889, | |
| "grad_norm": 21596.00390625, | |
| "kl": 47.39111328125, | |
| "learning_rate": 1.603448275862069e-06, | |
| "loss": 3.129, | |
| "reward": 0.4583333469927311, | |
| "reward_std": 0.31314555555582047, | |
| "rewards/accuracy_reward": 0.4583333469927311, | |
| "rewards/format_reward": 0.0, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 1372.6250610351562, | |
| "epoch": 0.1111111111111111, | |
| "grad_norm": 1235.3475341796875, | |
| "kl": 7.119140625, | |
| "learning_rate": 1.6551724137931035e-06, | |
| "loss": 0.6297, | |
| "reward": 0.3125000074505806, | |
| "reward_std": 0.36417656391859055, | |
| "rewards/accuracy_reward": 0.3125000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 1595.4792175292969, | |
| "epoch": 0.11458333333333333, | |
| "grad_norm": 532.248046875, | |
| "kl": 3.2314453125, | |
| "learning_rate": 1.706896551724138e-06, | |
| "loss": 0.4466, | |
| "reward": 0.33333334885537624, | |
| "reward_std": 0.3680921532213688, | |
| "rewards/accuracy_reward": 0.33333334885537624, | |
| "rewards/format_reward": 0.0, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 1461.5417175292969, | |
| "epoch": 0.11805555555555555, | |
| "grad_norm": 57.97556686401367, | |
| "kl": 1.5225830078125, | |
| "learning_rate": 1.7586206896551723e-06, | |
| "loss": 0.3115, | |
| "reward": 0.3125000111758709, | |
| "reward_std": 0.334495410323143, | |
| "rewards/accuracy_reward": 0.3125000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 1379.9166717529297, | |
| "epoch": 0.12152777777777778, | |
| "grad_norm": 134.0404510498047, | |
| "kl": 1.67919921875, | |
| "learning_rate": 1.8103448275862069e-06, | |
| "loss": 0.4142, | |
| "reward": 0.3750000186264515, | |
| "reward_std": 0.4248107075691223, | |
| "rewards/accuracy_reward": 0.3750000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 1395.6041870117188, | |
| "epoch": 0.125, | |
| "grad_norm": 2129.449462890625, | |
| "kl": 17.14453125, | |
| "learning_rate": 1.8620689655172414e-06, | |
| "loss": 1.2491, | |
| "reward": 0.3750000149011612, | |
| "reward_std": 0.4152076169848442, | |
| "rewards/accuracy_reward": 0.3750000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 1387.2708740234375, | |
| "epoch": 0.1284722222222222, | |
| "grad_norm": 149.7142333984375, | |
| "kl": 3.673828125, | |
| "learning_rate": 1.913793103448276e-06, | |
| "loss": 0.3786, | |
| "reward": 0.37500001303851604, | |
| "reward_std": 0.3506578765809536, | |
| "rewards/accuracy_reward": 0.37500001303851604, | |
| "rewards/format_reward": 0.0, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 1370.5834045410156, | |
| "epoch": 0.13194444444444445, | |
| "grad_norm": 100.72909545898438, | |
| "kl": 0.69140625, | |
| "learning_rate": 1.9655172413793105e-06, | |
| "loss": 0.3086, | |
| "reward": 0.354166679084301, | |
| "reward_std": 0.3170611336827278, | |
| "rewards/accuracy_reward": 0.354166679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 1317.0000305175781, | |
| "epoch": 0.13541666666666666, | |
| "grad_norm": 85.56684112548828, | |
| "kl": 0.57080078125, | |
| "learning_rate": 2.017241379310345e-06, | |
| "loss": 0.3471, | |
| "reward": 0.3958333358168602, | |
| "reward_std": 0.37377967685461044, | |
| "rewards/accuracy_reward": 0.3958333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 1274.2916870117188, | |
| "epoch": 0.1388888888888889, | |
| "grad_norm": 24.658910751342773, | |
| "kl": 0.56005859375, | |
| "learning_rate": 2.0689655172413796e-06, | |
| "loss": 0.1716, | |
| "reward": 0.3958333432674408, | |
| "reward_std": 0.2350771352648735, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 1396.3541870117188, | |
| "epoch": 0.1423611111111111, | |
| "grad_norm": 54.371578216552734, | |
| "kl": 1.95703125, | |
| "learning_rate": 2.1206896551724137e-06, | |
| "loss": 0.3221, | |
| "reward": 0.2708333432674408, | |
| "reward_std": 0.2621144950389862, | |
| "rewards/accuracy_reward": 0.2708333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 1410.4791870117188, | |
| "epoch": 0.14583333333333334, | |
| "grad_norm": 2246.173095703125, | |
| "kl": 20.494140625, | |
| "learning_rate": 2.1724137931034482e-06, | |
| "loss": 1.1031, | |
| "reward": 0.25, | |
| "reward_std": 0.18404608219861984, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 0.0, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 1208.9167175292969, | |
| "epoch": 0.14930555555555555, | |
| "grad_norm": 22.791847229003906, | |
| "kl": 0.71044921875, | |
| "learning_rate": 2.2241379310344828e-06, | |
| "loss": 0.2223, | |
| "reward": 0.5625000149011612, | |
| "reward_std": 0.3990451544523239, | |
| "rewards/accuracy_reward": 0.5625000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 1377.2709045410156, | |
| "epoch": 0.1527777777777778, | |
| "grad_norm": 206.04298400878906, | |
| "kl": 2.458984375, | |
| "learning_rate": 2.2758620689655173e-06, | |
| "loss": 0.3242, | |
| "reward": 0.3541666753590107, | |
| "reward_std": 0.35457348823547363, | |
| "rewards/accuracy_reward": 0.3541666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 1330.4791870117188, | |
| "epoch": 0.15625, | |
| "grad_norm": 1637.8236083984375, | |
| "kl": 6.103515625, | |
| "learning_rate": 2.327586206896552e-06, | |
| "loss": 0.5009, | |
| "reward": 0.4375000111758709, | |
| "reward_std": 0.317061148583889, | |
| "rewards/accuracy_reward": 0.4375000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 1766.1459045410156, | |
| "epoch": 0.1597222222222222, | |
| "grad_norm": 48.82515335083008, | |
| "kl": 1.3642578125, | |
| "learning_rate": 2.3793103448275864e-06, | |
| "loss": 0.2552, | |
| "reward": 0.14583333767950535, | |
| "reward_std": 0.2446802258491516, | |
| "rewards/accuracy_reward": 0.14583333767950535, | |
| "rewards/format_reward": 0.0, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 1520.4166870117188, | |
| "epoch": 0.16319444444444445, | |
| "grad_norm": 17.03814697265625, | |
| "kl": 0.9755859375, | |
| "learning_rate": 2.4310344827586205e-06, | |
| "loss": 0.318, | |
| "reward": 0.2500000074505806, | |
| "reward_std": 0.31314555555582047, | |
| "rewards/accuracy_reward": 0.2500000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 1801.7500305175781, | |
| "epoch": 0.16666666666666666, | |
| "grad_norm": 11.916468620300293, | |
| "kl": 0.7470703125, | |
| "learning_rate": 2.482758620689655e-06, | |
| "loss": 0.1852, | |
| "reward": 0.16666666977107525, | |
| "reward_std": 0.24859581515192986, | |
| "rewards/accuracy_reward": 0.16666666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 1811.5416870117188, | |
| "epoch": 0.1701388888888889, | |
| "grad_norm": 276.9430236816406, | |
| "kl": 3.390625, | |
| "learning_rate": 2.5344827586206896e-06, | |
| "loss": 0.2597, | |
| "reward": 0.1041666679084301, | |
| "reward_std": 0.21764283254742622, | |
| "rewards/accuracy_reward": 0.1041666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 1849.9167175292969, | |
| "epoch": 0.1736111111111111, | |
| "grad_norm": 462.90350341796875, | |
| "kl": 5.150390625, | |
| "learning_rate": 2.586206896551724e-06, | |
| "loss": 0.3538, | |
| "reward": 0.0833333358168602, | |
| "reward_std": 0.16661179810762405, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 1779.5208435058594, | |
| "epoch": 0.17708333333333334, | |
| "grad_norm": 11.4965238571167, | |
| "kl": 0.43212890625, | |
| "learning_rate": 2.6379310344827587e-06, | |
| "loss": 0.1934, | |
| "reward": 0.1666666679084301, | |
| "reward_std": 0.24859581142663956, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 1853.7500305175781, | |
| "epoch": 0.18055555555555555, | |
| "grad_norm": 16389.8828125, | |
| "kl": 317.66796875, | |
| "learning_rate": 2.6896551724137932e-06, | |
| "loss": 12.8975, | |
| "reward": 0.1041666679084301, | |
| "reward_std": 0.1705273911356926, | |
| "rewards/accuracy_reward": 0.1041666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 1800.0, | |
| "epoch": 0.1840277777777778, | |
| "grad_norm": 434.0151672363281, | |
| "kl": 5.490234375, | |
| "learning_rate": 2.7413793103448278e-06, | |
| "loss": 0.3164, | |
| "reward": 0.1666666679084301, | |
| "reward_std": 0.20412413775920868, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 1855.1041870117188, | |
| "epoch": 0.1875, | |
| "grad_norm": 747.1852416992188, | |
| "kl": 3.0390625, | |
| "learning_rate": 2.793103448275862e-06, | |
| "loss": 0.2443, | |
| "reward": 0.08333333395421505, | |
| "reward_std": 0.16661179438233376, | |
| "rewards/accuracy_reward": 0.08333333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 1740.1042175292969, | |
| "epoch": 0.1909722222222222, | |
| "grad_norm": 46.64550018310547, | |
| "kl": 0.7890625, | |
| "learning_rate": 2.8448275862068964e-06, | |
| "loss": 0.2331, | |
| "reward": 0.1666666679084301, | |
| "reward_std": 0.2861081510782242, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 1567.0000305175781, | |
| "epoch": 0.19444444444444445, | |
| "grad_norm": 69.09370422363281, | |
| "kl": 0.7421875, | |
| "learning_rate": 2.8965517241379314e-06, | |
| "loss": 0.2864, | |
| "reward": 0.3125000037252903, | |
| "reward_std": 0.37377967685461044, | |
| "rewards/accuracy_reward": 0.3125000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 1466.5416870117188, | |
| "epoch": 0.19791666666666666, | |
| "grad_norm": 5064.72265625, | |
| "kl": 9.0234375, | |
| "learning_rate": 2.9482758620689655e-06, | |
| "loss": 0.9781, | |
| "reward": 0.3541666679084301, | |
| "reward_std": 0.30922994762659073, | |
| "rewards/accuracy_reward": 0.3541666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 1425.125015258789, | |
| "epoch": 0.2013888888888889, | |
| "grad_norm": 123.96880340576172, | |
| "kl": 1.74566650390625, | |
| "learning_rate": 3e-06, | |
| "loss": 0.2074, | |
| "reward": 0.3541666828095913, | |
| "reward_std": 0.28219255805015564, | |
| "rewards/accuracy_reward": 0.3541666828095913, | |
| "rewards/format_reward": 0.0, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 1762.5416870117188, | |
| "epoch": 0.2048611111111111, | |
| "grad_norm": 1912.9332275390625, | |
| "kl": 10.77734375, | |
| "learning_rate": 2.9999724132742073e-06, | |
| "loss": 0.5593, | |
| "reward": 0.1875000111758709, | |
| "reward_std": 0.1801304891705513, | |
| "rewards/accuracy_reward": 0.1875000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 1470.0416870117188, | |
| "epoch": 0.20833333333333334, | |
| "grad_norm": 24224.904296875, | |
| "kl": 23.6171875, | |
| "learning_rate": 2.9998896541115324e-06, | |
| "loss": 1.491, | |
| "reward": 0.3750000223517418, | |
| "reward_std": 0.33057980239391327, | |
| "rewards/accuracy_reward": 0.3750000223517418, | |
| "rewards/format_reward": 0.0, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 1698.1875610351562, | |
| "epoch": 0.21180555555555555, | |
| "grad_norm": 2391.64306640625, | |
| "kl": 4.623046875, | |
| "learning_rate": 2.9997517255560477e-06, | |
| "loss": 0.366, | |
| "reward": 0.1875000037252903, | |
| "reward_std": 0.28219256550073624, | |
| "rewards/accuracy_reward": 0.1875000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 1820.4166870117188, | |
| "epoch": 0.2152777777777778, | |
| "grad_norm": 51.48255157470703, | |
| "kl": 1.201171875, | |
| "learning_rate": 2.999558632681083e-06, | |
| "loss": 0.1731, | |
| "reward": 0.12500000558793545, | |
| "reward_std": 0.16661179810762405, | |
| "rewards/accuracy_reward": 0.12500000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 1663.9791870117188, | |
| "epoch": 0.21875, | |
| "grad_norm": 137.2998809814453, | |
| "kl": 0.9423828125, | |
| "learning_rate": 2.9993103825890386e-06, | |
| "loss": 0.2046, | |
| "reward": 0.2708333395421505, | |
| "reward_std": 0.2525114193558693, | |
| "rewards/accuracy_reward": 0.2708333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 1713.9583740234375, | |
| "epoch": 0.2222222222222222, | |
| "grad_norm": 61.36538314819336, | |
| "kl": 1.259765625, | |
| "learning_rate": 2.999006984411124e-06, | |
| "loss": 0.1113, | |
| "reward": 0.1458333358168602, | |
| "reward_std": 0.13301505148410797, | |
| "rewards/accuracy_reward": 0.1458333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 1711.3958740234375, | |
| "epoch": 0.22569444444444445, | |
| "grad_norm": 59.91830062866211, | |
| "kl": 0.81640625, | |
| "learning_rate": 2.9986484493070226e-06, | |
| "loss": 0.203, | |
| "reward": 0.16666667349636555, | |
| "reward_std": 0.16661180183291435, | |
| "rewards/accuracy_reward": 0.16666667349636555, | |
| "rewards/format_reward": 0.0, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 1784.6667175292969, | |
| "epoch": 0.22916666666666666, | |
| "grad_norm": 183.44163513183594, | |
| "kl": 2.037109375, | |
| "learning_rate": 2.9982347904644796e-06, | |
| "loss": 0.2535, | |
| "reward": 0.2083333358168602, | |
| "reward_std": 0.30354244634509087, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 1773.5833740234375, | |
| "epoch": 0.2326388888888889, | |
| "grad_norm": 40.59729766845703, | |
| "kl": 1.408203125, | |
| "learning_rate": 2.99776602309882e-06, | |
| "loss": 0.1699, | |
| "reward": 0.08333333395421505, | |
| "reward_std": 0.11949636042118073, | |
| "rewards/accuracy_reward": 0.08333333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 1607.2291870117188, | |
| "epoch": 0.2361111111111111, | |
| "grad_norm": 190.56307983398438, | |
| "kl": 2.71875, | |
| "learning_rate": 2.997242164452386e-06, | |
| "loss": 0.3546, | |
| "reward": 0.22916667349636555, | |
| "reward_std": 0.2350771203637123, | |
| "rewards/accuracy_reward": 0.22916667349636555, | |
| "rewards/format_reward": 0.0, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 1859.0416870117188, | |
| "epoch": 0.23958333333333334, | |
| "grad_norm": 14.658742904663086, | |
| "kl": 0.892578125, | |
| "learning_rate": 2.996663233793904e-06, | |
| "loss": 0.1354, | |
| "reward": 0.0833333358168602, | |
| "reward_std": 0.16661180555820465, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 1936.6041870117188, | |
| "epoch": 0.24305555555555555, | |
| "grad_norm": 4.933399200439453, | |
| "kl": 0.8916015625, | |
| "learning_rate": 2.9960292524177757e-06, | |
| "loss": 0.0835, | |
| "reward": 0.0416666679084301, | |
| "reward_std": 0.06454972922801971, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 1590.2500305175781, | |
| "epoch": 0.2465277777777778, | |
| "grad_norm": 53.29322052001953, | |
| "kl": 0.452880859375, | |
| "learning_rate": 2.995340243643295e-06, | |
| "loss": 0.2016, | |
| "reward": 0.18750000558793545, | |
| "reward_std": 0.2350771315395832, | |
| "rewards/accuracy_reward": 0.18750000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 1518.1250610351562, | |
| "epoch": 0.25, | |
| "grad_norm": 24.75818634033203, | |
| "kl": 0.4029541015625, | |
| "learning_rate": 2.99459623281379e-06, | |
| "loss": 0.1579, | |
| "reward": 0.20833333395421505, | |
| "reward_std": 0.16661179438233376, | |
| "rewards/accuracy_reward": 0.20833333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 1581.8333740234375, | |
| "epoch": 0.2534722222222222, | |
| "grad_norm": 10.875574111938477, | |
| "kl": 0.5751953125, | |
| "learning_rate": 2.993797247295691e-06, | |
| "loss": 0.211, | |
| "reward": 0.25, | |
| "reward_std": 0.31314554065465927, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 0.0, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 1749.5000305175781, | |
| "epoch": 0.2569444444444444, | |
| "grad_norm": 8.835053443908691, | |
| "kl": 0.7529296875, | |
| "learning_rate": 2.992943316477524e-06, | |
| "loss": 0.128, | |
| "reward": 0.08333333395421505, | |
| "reward_std": 0.16661179438233376, | |
| "rewards/accuracy_reward": 0.08333333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 1679.5000305175781, | |
| "epoch": 0.2604166666666667, | |
| "grad_norm": 140.9902801513672, | |
| "kl": 1.3974609375, | |
| "learning_rate": 2.992034471768829e-06, | |
| "loss": 0.1766, | |
| "reward": 0.08333333395421505, | |
| "reward_std": 0.11949635669589043, | |
| "rewards/accuracy_reward": 0.08333333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 1406.1875610351562, | |
| "epoch": 0.2638888888888889, | |
| "grad_norm": 22.414833068847656, | |
| "kl": 1.009765625, | |
| "learning_rate": 2.991070746599007e-06, | |
| "loss": 0.2882, | |
| "reward": 0.2083333395421505, | |
| "reward_std": 0.2861081659793854, | |
| "rewards/accuracy_reward": 0.2083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 1700.7708435058594, | |
| "epoch": 0.2673611111111111, | |
| "grad_norm": 12.40774154663086, | |
| "kl": 0.912109375, | |
| "learning_rate": 2.9900521764160856e-06, | |
| "loss": 0.1658, | |
| "reward": 0.14583333767950535, | |
| "reward_std": 0.21764283627271652, | |
| "rewards/accuracy_reward": 0.14583333767950535, | |
| "rewards/format_reward": 0.0, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 1774.4583740234375, | |
| "epoch": 0.2708333333333333, | |
| "grad_norm": 13.127084732055664, | |
| "kl": 1.1982421875, | |
| "learning_rate": 2.988978798685421e-06, | |
| "loss": 0.115, | |
| "reward": 0.06250000186264515, | |
| "reward_std": 0.11558076366782188, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 1813.0625, | |
| "epoch": 0.2743055555555556, | |
| "grad_norm": 1965.3333740234375, | |
| "kl": 16.447265625, | |
| "learning_rate": 2.987850652888315e-06, | |
| "loss": 0.7641, | |
| "reward": 0.1041666679084301, | |
| "reward_std": 0.13301505148410797, | |
| "rewards/accuracy_reward": 0.1041666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 1689.8333435058594, | |
| "epoch": 0.2777777777777778, | |
| "grad_norm": 643.5816040039062, | |
| "kl": 2.099609375, | |
| "learning_rate": 2.986667780520568e-06, | |
| "loss": 0.2512, | |
| "reward": 0.1875000037252903, | |
| "reward_std": 0.19756478071212769, | |
| "rewards/accuracy_reward": 0.1875000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 1745.7083740234375, | |
| "epoch": 0.28125, | |
| "grad_norm": 321.9258117675781, | |
| "kl": 4.794921875, | |
| "learning_rate": 2.985430225090945e-06, | |
| "loss": 0.2189, | |
| "reward": 0.02083333395421505, | |
| "reward_std": 0.05103103816509247, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 1683.5208435058594, | |
| "epoch": 0.2847222222222222, | |
| "grad_norm": 1353.93310546875, | |
| "kl": 1.09130859375, | |
| "learning_rate": 2.9841380321195844e-06, | |
| "loss": 0.2424, | |
| "reward": 0.20833333395421505, | |
| "reward_std": 0.25642700120806694, | |
| "rewards/accuracy_reward": 0.20833333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 1496.6250305175781, | |
| "epoch": 0.2881944444444444, | |
| "grad_norm": 13.8945951461792, | |
| "kl": 0.5712890625, | |
| "learning_rate": 2.9827912491363164e-06, | |
| "loss": 0.2168, | |
| "reward": 0.31250000558793545, | |
| "reward_std": 0.2525114119052887, | |
| "rewards/accuracy_reward": 0.31250000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 1610.3125305175781, | |
| "epoch": 0.2916666666666667, | |
| "grad_norm": 802.035888671875, | |
| "kl": 3.7255859375, | |
| "learning_rate": 2.9813899256789174e-06, | |
| "loss": 0.5385, | |
| "reward": 0.25000000558793545, | |
| "reward_std": 0.30354243889451027, | |
| "rewards/accuracy_reward": 0.25000000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 1524.6875305175781, | |
| "epoch": 0.2951388888888889, | |
| "grad_norm": 3.180734157562256, | |
| "kl": 0.5205078125, | |
| "learning_rate": 2.97993411329129e-06, | |
| "loss": 0.1459, | |
| "reward": 0.1875000037252903, | |
| "reward_std": 0.2350771203637123, | |
| "rewards/accuracy_reward": 0.1875000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 1729.4167175292969, | |
| "epoch": 0.2986111111111111, | |
| "grad_norm": 18.083194732666016, | |
| "kl": 0.7119140625, | |
| "learning_rate": 2.978423865521563e-06, | |
| "loss": 0.2276, | |
| "reward": 0.1875, | |
| "reward_std": 0.1705273985862732, | |
| "rewards/accuracy_reward": 0.1875, | |
| "rewards/format_reward": 0.0, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 1700.7708740234375, | |
| "epoch": 0.3020833333333333, | |
| "grad_norm": 10.720458030700684, | |
| "kl": 0.7744140625, | |
| "learning_rate": 2.9768592379201243e-06, | |
| "loss": 0.2174, | |
| "reward": 0.20833334140479565, | |
| "reward_std": 0.3131455294787884, | |
| "rewards/accuracy_reward": 0.20833334140479565, | |
| "rewards/format_reward": 0.0, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 1557.9167175292969, | |
| "epoch": 0.3055555555555556, | |
| "grad_norm": 6.683545112609863, | |
| "kl": 0.82958984375, | |
| "learning_rate": 2.9752402880375777e-06, | |
| "loss": 0.269, | |
| "reward": 0.3125000037252903, | |
| "reward_std": 0.3344954252243042, | |
| "rewards/accuracy_reward": 0.3125000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 1786.1667175292969, | |
| "epoch": 0.3090277777777778, | |
| "grad_norm": 221.86241149902344, | |
| "kl": 1.0849609375, | |
| "learning_rate": 2.9735670754226253e-06, | |
| "loss": 0.1301, | |
| "reward": 0.06250000186264515, | |
| "reward_std": 0.11558076366782188, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 1416.3958740234375, | |
| "epoch": 0.3125, | |
| "grad_norm": 3760907.5, | |
| "kl": 112641.357421875, | |
| "learning_rate": 2.9718396616198768e-06, | |
| "loss": 4501.6108, | |
| "reward": 0.31250000558793545, | |
| "reward_std": 0.31970489770174026, | |
| "rewards/accuracy_reward": 0.31250000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 1740.6666870117188, | |
| "epoch": 0.3159722222222222, | |
| "grad_norm": 4.726524353027344, | |
| "kl": 0.9208984375, | |
| "learning_rate": 2.9700581101675876e-06, | |
| "loss": 0.0862, | |
| "reward": 0.0416666679084301, | |
| "reward_std": 0.10206206887960434, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 1720.4792175292969, | |
| "epoch": 0.3194444444444444, | |
| "grad_norm": 63951.12890625, | |
| "kl": 696.65625, | |
| "learning_rate": 2.9682224865953187e-06, | |
| "loss": 34.2019, | |
| "reward": 0.08333333395421505, | |
| "reward_std": 0.16661179438233376, | |
| "rewards/accuracy_reward": 0.08333333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 1665.2916870117188, | |
| "epoch": 0.3229166666666667, | |
| "grad_norm": 14.36121654510498, | |
| "kl": 0.8623046875, | |
| "learning_rate": 2.9663328584215296e-06, | |
| "loss": 0.1886, | |
| "reward": 0.1250000037252903, | |
| "reward_std": 0.23116152733564377, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 1466.1875610351562, | |
| "epoch": 0.3263888888888889, | |
| "grad_norm": 77008.15625, | |
| "kl": 899.75, | |
| "learning_rate": 2.9643892951510922e-06, | |
| "loss": 45.7416, | |
| "reward": 0.2500000037252903, | |
| "reward_std": 0.377695269882679, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 1895.6041870117188, | |
| "epoch": 0.3298611111111111, | |
| "grad_norm": 9872.4345703125, | |
| "kl": 89.375, | |
| "learning_rate": 2.9623918682727352e-06, | |
| "loss": 3.8829, | |
| "reward": 0.1041666679084301, | |
| "reward_std": 0.1705273911356926, | |
| "rewards/accuracy_reward": 0.1041666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 1635.875, | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 6.599135875701904, | |
| "kl": 0.5634765625, | |
| "learning_rate": 2.9603406512564172e-06, | |
| "loss": 0.0839, | |
| "reward": 0.08333333395421505, | |
| "reward_std": 0.11949635669589043, | |
| "rewards/accuracy_reward": 0.08333333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 1886.6667175292969, | |
| "epoch": 0.3368055555555556, | |
| "grad_norm": 6.1785430908203125, | |
| "kl": 0.55322265625, | |
| "learning_rate": 2.958235719550619e-06, | |
| "loss": 0.0737, | |
| "reward": 0.0416666679084301, | |
| "reward_std": 0.10206207633018494, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 1679.6667175292969, | |
| "epoch": 0.3402777777777778, | |
| "grad_norm": 1524.2078857421875, | |
| "kl": 8.71875, | |
| "learning_rate": 2.956077150579571e-06, | |
| "loss": 0.4625, | |
| "reward": 0.0416666679084301, | |
| "reward_std": 0.10206207633018494, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 1814.5208740234375, | |
| "epoch": 0.34375, | |
| "grad_norm": 13820.7841796875, | |
| "kl": 88.75, | |
| "learning_rate": 2.9538650237404094e-06, | |
| "loss": 3.7684, | |
| "reward": 0.0416666679084301, | |
| "reward_std": 0.10206207260489464, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 1842.8541870117188, | |
| "epoch": 0.3472222222222222, | |
| "grad_norm": 434.58612060546875, | |
| "kl": 12.96875, | |
| "learning_rate": 2.9515994204002487e-06, | |
| "loss": 0.5246, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 1808.5208740234375, | |
| "epoch": 0.3506944444444444, | |
| "grad_norm": 9.609249114990234, | |
| "kl": 0.7236328125, | |
| "learning_rate": 2.949280423893192e-06, | |
| "loss": 0.0294, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 1777.5833740234375, | |
| "epoch": 0.3541666666666667, | |
| "grad_norm": 3.4232985973358154, | |
| "kl": 0.5830078125, | |
| "learning_rate": 2.946908119517268e-06, | |
| "loss": 0.0237, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 1784.5625610351562, | |
| "epoch": 0.3576388888888889, | |
| "grad_norm": 1.636131763458252, | |
| "kl": 0.42236328125, | |
| "learning_rate": 2.9444825945312897e-06, | |
| "loss": 0.0973, | |
| "reward": 0.06250000186264515, | |
| "reward_std": 0.11558076366782188, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 1727.0625610351562, | |
| "epoch": 0.3611111111111111, | |
| "grad_norm": 1.850907802581787, | |
| "kl": 0.3369140625, | |
| "learning_rate": 2.9420039381516486e-06, | |
| "loss": 0.1347, | |
| "reward": 0.0833333358168602, | |
| "reward_std": 0.20412415266036987, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 1911.0416870117188, | |
| "epoch": 0.3645833333333333, | |
| "grad_norm": 1.0294091701507568, | |
| "kl": 0.5341796875, | |
| "learning_rate": 2.9394722415490285e-06, | |
| "loss": 0.0662, | |
| "reward": 0.02083333395421505, | |
| "reward_std": 0.05103103443980217, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 1875.8542175292969, | |
| "epoch": 0.3680555555555556, | |
| "grad_norm": 7.6678242683410645, | |
| "kl": 0.36767578125, | |
| "learning_rate": 2.936887597845057e-06, | |
| "loss": 0.0401, | |
| "reward": 0.02083333395421505, | |
| "reward_std": 0.05103103443980217, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 1904.6250610351562, | |
| "epoch": 0.3715277777777778, | |
| "grad_norm": 335.5174865722656, | |
| "kl": 5.96875, | |
| "learning_rate": 2.9342501021088764e-06, | |
| "loss": 0.2439, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 1972.3541870117188, | |
| "epoch": 0.375, | |
| "grad_norm": 10.273797035217285, | |
| "kl": 0.6005859375, | |
| "learning_rate": 2.9315598513536496e-06, | |
| "loss": 0.0241, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 1954.5833435058594, | |
| "epoch": 0.3784722222222222, | |
| "grad_norm": 165.60858154296875, | |
| "kl": 21.5576171875, | |
| "learning_rate": 2.9288169445329886e-06, | |
| "loss": 0.9027, | |
| "reward": 0.02083333395421505, | |
| "reward_std": 0.05103103443980217, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 1816.0833740234375, | |
| "epoch": 0.3819444444444444, | |
| "grad_norm": 128.80422973632812, | |
| "kl": 35.7939453125, | |
| "learning_rate": 2.9260214825373185e-06, | |
| "loss": 1.4108, | |
| "reward": 0.10416666977107525, | |
| "reward_std": 0.11558076366782188, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 2030.4791870117188, | |
| "epoch": 0.3854166666666667, | |
| "grad_norm": 13.686920166015625, | |
| "kl": 0.505859375, | |
| "learning_rate": 2.9231735681901645e-06, | |
| "loss": 0.0203, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 2024.0833435058594, | |
| "epoch": 0.3888888888888889, | |
| "grad_norm": 1.7501544952392578, | |
| "kl": 0.3896484375, | |
| "learning_rate": 2.9202733062443688e-06, | |
| "loss": 0.0156, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 2004.9791870117188, | |
| "epoch": 0.3923611111111111, | |
| "grad_norm": 226.14535522460938, | |
| "kl": 20.076171875, | |
| "learning_rate": 2.9173208033782398e-06, | |
| "loss": 0.9243, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 1985.5833435058594, | |
| "epoch": 0.3958333333333333, | |
| "grad_norm": 2.1941516399383545, | |
| "kl": 0.244140625, | |
| "learning_rate": 2.9143161681916264e-06, | |
| "loss": 0.0098, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 1845.3333740234375, | |
| "epoch": 0.3993055555555556, | |
| "grad_norm": 13.305416107177734, | |
| "kl": 4.19580078125, | |
| "learning_rate": 2.9112595112019248e-06, | |
| "loss": 0.2172, | |
| "reward": 0.06250000186264515, | |
| "reward_std": 0.11558076739311218, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 2007.2708435058594, | |
| "epoch": 0.4027777777777778, | |
| "grad_norm": 7.043557643890381, | |
| "kl": 2.256591796875, | |
| "learning_rate": 2.908150944840013e-06, | |
| "loss": 0.0791, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 1877.9375610351562, | |
| "epoch": 0.40625, | |
| "grad_norm": 118.13216400146484, | |
| "kl": 1.447265625, | |
| "learning_rate": 2.9049905834461144e-06, | |
| "loss": 0.0626, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 1961.5416870117188, | |
| "epoch": 0.4097222222222222, | |
| "grad_norm": 0.68906170129776, | |
| "kl": 0.207763671875, | |
| "learning_rate": 2.9017785432655936e-06, | |
| "loss": 0.0083, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 1707.0417175292969, | |
| "epoch": 0.4131944444444444, | |
| "grad_norm": 8.199712753295898, | |
| "kl": 0.417236328125, | |
| "learning_rate": 2.898514942444679e-06, | |
| "loss": 0.1492, | |
| "reward": 0.1041666679084301, | |
| "reward_std": 0.13301505148410797, | |
| "rewards/accuracy_reward": 0.1041666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 1773.0833740234375, | |
| "epoch": 0.4166666666666667, | |
| "grad_norm": 0.5564130544662476, | |
| "kl": 0.335693359375, | |
| "learning_rate": 2.8951999010261185e-06, | |
| "loss": 0.0165, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 1937.2500305175781, | |
| "epoch": 0.4201388888888889, | |
| "grad_norm": 0.38130059838294983, | |
| "kl": 0.62548828125, | |
| "learning_rate": 2.891833540944764e-06, | |
| "loss": 0.0249, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 1825.9583740234375, | |
| "epoch": 0.4236111111111111, | |
| "grad_norm": 1.3825017213821411, | |
| "kl": 0.607177734375, | |
| "learning_rate": 2.888415986023086e-06, | |
| "loss": 0.0564, | |
| "reward": 0.02083333395421505, | |
| "reward_std": 0.05103103443980217, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 1638.5833740234375, | |
| "epoch": 0.4270833333333333, | |
| "grad_norm": 2.1698951721191406, | |
| "kl": 0.1492919921875, | |
| "learning_rate": 2.8849473619666165e-06, | |
| "loss": 0.1233, | |
| "reward": 0.1041666716337204, | |
| "reward_std": 0.11558075994253159, | |
| "rewards/accuracy_reward": 0.1041666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 1736.3125305175781, | |
| "epoch": 0.4305555555555556, | |
| "grad_norm": 0.5968286395072937, | |
| "kl": 1.01513671875, | |
| "learning_rate": 2.8814277963593322e-06, | |
| "loss": 0.1257, | |
| "reward": 0.08333333395421505, | |
| "reward_std": 0.11949635669589043, | |
| "rewards/accuracy_reward": 0.08333333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 1638.1875305175781, | |
| "epoch": 0.4340277777777778, | |
| "grad_norm": 3.021986246109009, | |
| "kl": 0.81005859375, | |
| "learning_rate": 2.877857418658953e-06, | |
| "loss": 0.1744, | |
| "reward": 0.1041666716337204, | |
| "reward_std": 0.13301503658294678, | |
| "rewards/accuracy_reward": 0.1041666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 1677.5208740234375, | |
| "epoch": 0.4375, | |
| "grad_norm": 1.4195492267608643, | |
| "kl": 0.66552734375, | |
| "learning_rate": 2.8742363601921877e-06, | |
| "loss": 0.0705, | |
| "reward": 0.06250000186264515, | |
| "reward_std": 0.11558076366782188, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 1674.1875305175781, | |
| "epoch": 0.4409722222222222, | |
| "grad_norm": 717.3495483398438, | |
| "kl": 12.380859375, | |
| "learning_rate": 2.870564754149898e-06, | |
| "loss": 0.5629, | |
| "reward": 0.0416666679084301, | |
| "reward_std": 0.10206206887960434, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 1644.8541870117188, | |
| "epoch": 0.4444444444444444, | |
| "grad_norm": 0.4486486315727234, | |
| "kl": 0.634765625, | |
| "learning_rate": 2.866842735582204e-06, | |
| "loss": 0.0598, | |
| "reward": 0.0416666679084301, | |
| "reward_std": 0.06454972922801971, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 1703.4583740234375, | |
| "epoch": 0.4479166666666667, | |
| "grad_norm": 0.45255520939826965, | |
| "kl": 0.7451171875, | |
| "learning_rate": 2.863070441393511e-06, | |
| "loss": 0.0303, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 1737.1875305175781, | |
| "epoch": 0.4513888888888889, | |
| "grad_norm": 0.6101142168045044, | |
| "kl": 0.546875, | |
| "learning_rate": 2.859248010337482e-06, | |
| "loss": 0.0228, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 1529.8750610351562, | |
| "epoch": 0.4548611111111111, | |
| "grad_norm": 2.965920925140381, | |
| "kl": 0.5712890625, | |
| "learning_rate": 2.8553755830119252e-06, | |
| "loss": 0.0507, | |
| "reward": 0.02083333395421505, | |
| "reward_std": 0.05103103443980217, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 1692.9583740234375, | |
| "epoch": 0.4583333333333333, | |
| "grad_norm": 0.2750244736671448, | |
| "kl": 0.478515625, | |
| "learning_rate": 2.851453301853629e-06, | |
| "loss": 0.0905, | |
| "reward": 0.0416666679084301, | |
| "reward_std": 0.10206207633018494, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 1811.3750305175781, | |
| "epoch": 0.4618055555555556, | |
| "grad_norm": 0.274085134267807, | |
| "kl": 0.4521484375, | |
| "learning_rate": 2.8474813111331196e-06, | |
| "loss": 0.0184, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 1747.7708740234375, | |
| "epoch": 0.4652777777777778, | |
| "grad_norm": 0.4988988935947418, | |
| "kl": 0.34716796875, | |
| "learning_rate": 2.8434597569493546e-06, | |
| "loss": 0.0494, | |
| "reward": 0.02083333395421505, | |
| "reward_std": 0.05103103443980217, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 1768.6041870117188, | |
| "epoch": 0.46875, | |
| "grad_norm": 0.22119854390621185, | |
| "kl": 0.359375, | |
| "learning_rate": 2.8393887872243527e-06, | |
| "loss": 0.0136, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 1694.3542175292969, | |
| "epoch": 0.4722222222222222, | |
| "grad_norm": 0.36002039909362793, | |
| "kl": 0.288330078125, | |
| "learning_rate": 2.8352685516977465e-06, | |
| "loss": 0.0739, | |
| "reward": 0.0625, | |
| "reward_std": 0.06846532225608826, | |
| "rewards/accuracy_reward": 0.0625, | |
| "rewards/format_reward": 0.0, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 1842.9792175292969, | |
| "epoch": 0.4756944444444444, | |
| "grad_norm": 0.7425448894500732, | |
| "kl": 0.30810546875, | |
| "learning_rate": 2.83109920192128e-06, | |
| "loss": 0.0752, | |
| "reward": 0.0416666679084301, | |
| "reward_std": 0.10206207260489464, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 1778.2708435058594, | |
| "epoch": 0.4791666666666667, | |
| "grad_norm": 0.5278202295303345, | |
| "kl": 0.216796875, | |
| "learning_rate": 2.8268808912532317e-06, | |
| "loss": 0.0659, | |
| "reward": 0.0416666679084301, | |
| "reward_std": 0.10206206887960434, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 1660.6458740234375, | |
| "epoch": 0.4826388888888889, | |
| "grad_norm": 0.35345280170440674, | |
| "kl": 0.32080078125, | |
| "learning_rate": 2.8226137748527746e-06, | |
| "loss": 0.0371, | |
| "reward": 0.02083333395421505, | |
| "reward_std": 0.05103103443980217, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 1822.1667175292969, | |
| "epoch": 0.4861111111111111, | |
| "grad_norm": 0.44432520866394043, | |
| "kl": 0.436767578125, | |
| "learning_rate": 2.8182980096742684e-06, | |
| "loss": 0.0184, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 1910.9583740234375, | |
| "epoch": 0.4895833333333333, | |
| "grad_norm": 0.5671920776367188, | |
| "kl": 0.534912109375, | |
| "learning_rate": 2.8139337544614882e-06, | |
| "loss": 0.0226, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 1698.8541870117188, | |
| "epoch": 0.4930555555555556, | |
| "grad_norm": 1.235549807548523, | |
| "kl": 0.1435546875, | |
| "learning_rate": 2.8095211697417823e-06, | |
| "loss": 0.0771, | |
| "reward": 0.1041666679084301, | |
| "reward_std": 0.13301505148410797, | |
| "rewards/accuracy_reward": 0.1041666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 1808.2917175292969, | |
| "epoch": 0.4965277777777778, | |
| "grad_norm": 0.7526421546936035, | |
| "kl": 0.2568359375, | |
| "learning_rate": 2.8050604178201705e-06, | |
| "loss": 0.079, | |
| "reward": 0.10416666977107525, | |
| "reward_std": 0.1705274060368538, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 1631.3541870117188, | |
| "epoch": 0.5, | |
| "grad_norm": 1.259453535079956, | |
| "kl": 0.830078125, | |
| "learning_rate": 2.8005516627733725e-06, | |
| "loss": 0.1079, | |
| "reward": 0.06250000186264515, | |
| "reward_std": 0.11558076366782188, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 1280.0625305175781, | |
| "epoch": 0.5034722222222222, | |
| "grad_norm": 3.7468371391296387, | |
| "kl": 0.6396484375, | |
| "learning_rate": 2.7959950704437755e-06, | |
| "loss": 0.1834, | |
| "reward": 0.18750000186264515, | |
| "reward_std": 0.2996268458664417, | |
| "rewards/accuracy_reward": 0.18750000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 1411.4375610351562, | |
| "epoch": 0.5069444444444444, | |
| "grad_norm": 2.2973523139953613, | |
| "kl": 2.177734375, | |
| "learning_rate": 2.791390808433328e-06, | |
| "loss": 0.3023, | |
| "reward": 0.18750000186264515, | |
| "reward_std": 0.2525114081799984, | |
| "rewards/accuracy_reward": 0.18750000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 1717.8750305175781, | |
| "epoch": 0.5104166666666666, | |
| "grad_norm": 11.517189979553223, | |
| "kl": 2.87109375, | |
| "learning_rate": 2.786739046097383e-06, | |
| "loss": 0.2817, | |
| "reward": 0.1666666716337204, | |
| "reward_std": 0.24859582632780075, | |
| "rewards/accuracy_reward": 0.1666666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 1590.0000305175781, | |
| "epoch": 0.5138888888888888, | |
| "grad_norm": 3.584110975265503, | |
| "kl": 2.015625, | |
| "learning_rate": 2.7820399545384623e-06, | |
| "loss": 0.2711, | |
| "reward": 0.2291666753590107, | |
| "reward_std": 0.2621144950389862, | |
| "rewards/accuracy_reward": 0.2291666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 1512.2292175292969, | |
| "epoch": 0.5173611111111112, | |
| "grad_norm": 46.85298538208008, | |
| "kl": 1.19775390625, | |
| "learning_rate": 2.777293706599967e-06, | |
| "loss": 0.3577, | |
| "reward": 0.33333333395421505, | |
| "reward_std": 0.4056045077741146, | |
| "rewards/accuracy_reward": 0.33333333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 1826.6666870117188, | |
| "epoch": 0.5208333333333334, | |
| "grad_norm": 131134.640625, | |
| "kl": 111.875, | |
| "learning_rate": 2.7725004768598174e-06, | |
| "loss": 4.9777, | |
| "reward": 0.0833333358168602, | |
| "reward_std": 0.16661180555820465, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 1459.2709045410156, | |
| "epoch": 0.5243055555555556, | |
| "grad_norm": 3877.8349609375, | |
| "kl": 12.62890625, | |
| "learning_rate": 2.7676604416240326e-06, | |
| "loss": 0.7876, | |
| "reward": 0.2708333395421505, | |
| "reward_std": 0.3170611336827278, | |
| "rewards/accuracy_reward": 0.2708333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 1779.8333740234375, | |
| "epoch": 0.5277777777777778, | |
| "grad_norm": 105.57654571533203, | |
| "kl": 8.2080078125, | |
| "learning_rate": 2.7627737789202467e-06, | |
| "loss": 0.4021, | |
| "reward": 0.06250000186264515, | |
| "reward_std": 0.1530931033194065, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 1697.5000610351562, | |
| "epoch": 0.53125, | |
| "grad_norm": 42.35576248168945, | |
| "kl": 5.669921875, | |
| "learning_rate": 2.757840668491157e-06, | |
| "loss": 0.372, | |
| "reward": 0.1875000037252903, | |
| "reward_std": 0.19756478071212769, | |
| "rewards/accuracy_reward": 0.1875000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 1613.4583435058594, | |
| "epoch": 0.5347222222222222, | |
| "grad_norm": 17.527334213256836, | |
| "kl": 1.99609375, | |
| "learning_rate": 2.7528612917879175e-06, | |
| "loss": 0.2797, | |
| "reward": 0.27083333767950535, | |
| "reward_std": 0.3720077760517597, | |
| "rewards/accuracy_reward": 0.27083333767950535, | |
| "rewards/format_reward": 0.0, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 1423.2917175292969, | |
| "epoch": 0.5381944444444444, | |
| "grad_norm": 42.86451721191406, | |
| "kl": 2.865234375, | |
| "learning_rate": 2.747835831963461e-06, | |
| "loss": 0.4037, | |
| "reward": 0.3333333507180214, | |
| "reward_std": 0.3680921643972397, | |
| "rewards/accuracy_reward": 0.3333333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 1166.437515258789, | |
| "epoch": 0.5416666666666666, | |
| "grad_norm": 4.073652267456055, | |
| "kl": 2.2734375, | |
| "learning_rate": 2.7427644738657634e-06, | |
| "loss": 0.3778, | |
| "reward": 0.41666668467223644, | |
| "reward_std": 0.2861081659793854, | |
| "rewards/accuracy_reward": 0.41666668467223644, | |
| "rewards/format_reward": 0.0, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 1296.8750305175781, | |
| "epoch": 0.5451388888888888, | |
| "grad_norm": 47.81319046020508, | |
| "kl": 10.375, | |
| "learning_rate": 2.737647404031045e-06, | |
| "loss": 0.8308, | |
| "reward": 0.354166679084301, | |
| "reward_std": 0.42872629314661026, | |
| "rewards/accuracy_reward": 0.354166679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 1627.2291870117188, | |
| "epoch": 0.5486111111111112, | |
| "grad_norm": 9.572442054748535, | |
| "kl": 4.671875, | |
| "learning_rate": 2.7324848106769096e-06, | |
| "loss": 0.3456, | |
| "reward": 0.16666667349636555, | |
| "reward_std": 0.23116152361035347, | |
| "rewards/accuracy_reward": 0.16666667349636555, | |
| "rewards/format_reward": 0.0, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 1242.4792175292969, | |
| "epoch": 0.5520833333333334, | |
| "grad_norm": 4.172027111053467, | |
| "kl": 0.90380859375, | |
| "learning_rate": 2.72727688369542e-06, | |
| "loss": 0.2322, | |
| "reward": 0.3958333469927311, | |
| "reward_std": 0.37377967685461044, | |
| "rewards/accuracy_reward": 0.3958333469927311, | |
| "rewards/format_reward": 0.0, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 1467.1250610351562, | |
| "epoch": 0.5555555555555556, | |
| "grad_norm": 9.006033897399902, | |
| "kl": 1.0126953125, | |
| "learning_rate": 2.722023814646113e-06, | |
| "loss": 0.2192, | |
| "reward": 0.22916667349636555, | |
| "reward_std": 0.27258946746587753, | |
| "rewards/accuracy_reward": 0.22916667349636555, | |
| "rewards/format_reward": 0.0, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 799.0625457763672, | |
| "epoch": 0.5590277777777778, | |
| "grad_norm": 1064.71875, | |
| "kl": 10.662109375, | |
| "learning_rate": 2.7167257967489577e-06, | |
| "loss": 0.8803, | |
| "reward": 0.375, | |
| "reward_std": 0.3680921643972397, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 0.0, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 1014.2708587646484, | |
| "epoch": 0.5625, | |
| "grad_norm": 400.89520263671875, | |
| "kl": 34.6171875, | |
| "learning_rate": 2.7113830248772426e-06, | |
| "loss": 2.2489, | |
| "reward": 0.2083333395421505, | |
| "reward_std": 0.2861081697046757, | |
| "rewards/accuracy_reward": 0.2083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 696.875, | |
| "epoch": 0.5659722222222222, | |
| "grad_norm": 8.835132598876953, | |
| "kl": 2.4921875, | |
| "learning_rate": 2.705995695550411e-06, | |
| "loss": 0.2677, | |
| "reward": 0.3541666828095913, | |
| "reward_std": 0.36417656019330025, | |
| "rewards/accuracy_reward": 0.3541666828095913, | |
| "rewards/format_reward": 0.0, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 849.3333587646484, | |
| "epoch": 0.5694444444444444, | |
| "grad_norm": 25.482391357421875, | |
| "kl": 4.75390625, | |
| "learning_rate": 2.7005640069268325e-06, | |
| "loss": 0.5359, | |
| "reward": 0.4375000186264515, | |
| "reward_std": 0.48367293179035187, | |
| "rewards/accuracy_reward": 0.4375000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 798.895866394043, | |
| "epoch": 0.5729166666666666, | |
| "grad_norm": 123.7555160522461, | |
| "kl": 21.21875, | |
| "learning_rate": 2.695088158796513e-06, | |
| "loss": 1.046, | |
| "reward": 0.3750000149011612, | |
| "reward_std": 0.24859582632780075, | |
| "rewards/accuracy_reward": 0.3750000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 911.1250305175781, | |
| "epoch": 0.5763888888888888, | |
| "grad_norm": 40.9791259765625, | |
| "kl": 1.25, | |
| "learning_rate": 2.6895683525737467e-06, | |
| "loss": 0.1436, | |
| "reward": 0.2291666716337204, | |
| "reward_std": 0.317061148583889, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 1438.0417175292969, | |
| "epoch": 0.5798611111111112, | |
| "grad_norm": 8.88305377960205, | |
| "kl": 0.955078125, | |
| "learning_rate": 2.6840047912897087e-06, | |
| "loss": 0.1882, | |
| "reward": 0.1666666679084301, | |
| "reward_std": 0.23116152733564377, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 1704.1666870117188, | |
| "epoch": 0.5833333333333334, | |
| "grad_norm": 1.0943595170974731, | |
| "kl": 1.3603515625, | |
| "learning_rate": 2.6783976795849845e-06, | |
| "loss": 0.1345, | |
| "reward": 0.1458333395421505, | |
| "reward_std": 0.1530931033194065, | |
| "rewards/accuracy_reward": 0.1458333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 1742.9375610351562, | |
| "epoch": 0.5868055555555556, | |
| "grad_norm": 3.9962105751037598, | |
| "kl": 1.18359375, | |
| "learning_rate": 2.6727472237020448e-06, | |
| "loss": 0.1885, | |
| "reward": 0.12500000186264515, | |
| "reward_std": 0.18404608964920044, | |
| "rewards/accuracy_reward": 0.12500000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 1521.3333587646484, | |
| "epoch": 0.5902777777777778, | |
| "grad_norm": 3.4218358993530273, | |
| "kl": 1.43408203125, | |
| "learning_rate": 2.6670536314776595e-06, | |
| "loss": 0.2947, | |
| "reward": 0.229166679084301, | |
| "reward_std": 0.29962684214115143, | |
| "rewards/accuracy_reward": 0.229166679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 1536.0000305175781, | |
| "epoch": 0.59375, | |
| "grad_norm": 15.191433906555176, | |
| "kl": 5.6875, | |
| "learning_rate": 2.661317112335251e-06, | |
| "loss": 0.4542, | |
| "reward": 0.1875000037252903, | |
| "reward_std": 0.2525114193558693, | |
| "rewards/accuracy_reward": 0.1875000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 1296.8125305175781, | |
| "epoch": 0.5972222222222222, | |
| "grad_norm": 74.45282745361328, | |
| "kl": 3.84765625, | |
| "learning_rate": 2.655537877277195e-06, | |
| "loss": 0.3666, | |
| "reward": 0.3958333432674408, | |
| "reward_std": 0.3170611336827278, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 1482.3333740234375, | |
| "epoch": 0.6006944444444444, | |
| "grad_norm": 13.935741424560547, | |
| "kl": 31.96484375, | |
| "learning_rate": 2.6497161388770536e-06, | |
| "loss": 0.2814, | |
| "reward": 0.2291666716337204, | |
| "reward_std": 0.2621144950389862, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 1709.7083740234375, | |
| "epoch": 0.6041666666666666, | |
| "grad_norm": 7.259042739868164, | |
| "kl": 2.337890625, | |
| "learning_rate": 2.643852111271762e-06, | |
| "loss": 0.3467, | |
| "reward": 0.22916666977107525, | |
| "reward_std": 0.35457348451018333, | |
| "rewards/accuracy_reward": 0.22916666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 1677.9375610351562, | |
| "epoch": 0.6076388888888888, | |
| "grad_norm": 1.048950433731079, | |
| "kl": 2.38671875, | |
| "learning_rate": 2.6379460101537488e-06, | |
| "loss": 0.1757, | |
| "reward": 0.1041666679084301, | |
| "reward_std": 0.13301505148410797, | |
| "rewards/accuracy_reward": 0.1041666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 1645.9584045410156, | |
| "epoch": 0.6111111111111112, | |
| "grad_norm": 2.06364107131958, | |
| "kl": 1.40234375, | |
| "learning_rate": 2.6319980527630044e-06, | |
| "loss": 0.1798, | |
| "reward": 0.20833334140479565, | |
| "reward_std": 0.23116152733564377, | |
| "rewards/accuracy_reward": 0.20833334140479565, | |
| "rewards/format_reward": 0.0, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 1709.9166870117188, | |
| "epoch": 0.6145833333333334, | |
| "grad_norm": 0.5783344507217407, | |
| "kl": 2.28515625, | |
| "learning_rate": 2.6260084578790864e-06, | |
| "loss": 0.2011, | |
| "reward": 0.10416666977107525, | |
| "reward_std": 0.11558076739311218, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 1813.6250610351562, | |
| "epoch": 0.6180555555555556, | |
| "grad_norm": 1.2280815839767456, | |
| "kl": 1.638671875, | |
| "learning_rate": 2.6199774458130785e-06, | |
| "loss": 0.1822, | |
| "reward": 0.10416666977107525, | |
| "reward_std": 0.1801304966211319, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 1520.0000305175781, | |
| "epoch": 0.6215277777777778, | |
| "grad_norm": 2.869910717010498, | |
| "kl": 2.11328125, | |
| "learning_rate": 2.613905238399482e-06, | |
| "loss": 0.2802, | |
| "reward": 0.22916666977107525, | |
| "reward_std": 0.38425464555621147, | |
| "rewards/accuracy_reward": 0.22916666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 1496.8125305175781, | |
| "epoch": 0.625, | |
| "grad_norm": 2.689152240753174, | |
| "kl": 3.96484375, | |
| "learning_rate": 2.607792058988057e-06, | |
| "loss": 0.3758, | |
| "reward": 0.2083333395421505, | |
| "reward_std": 0.3035424277186394, | |
| "rewards/accuracy_reward": 0.2083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 1637.0208740234375, | |
| "epoch": 0.6284722222222222, | |
| "grad_norm": 2.250966787338257, | |
| "kl": 3.62890625, | |
| "learning_rate": 2.601638132435611e-06, | |
| "loss": 0.3209, | |
| "reward": 0.20833334140479565, | |
| "reward_std": 0.24859580025076866, | |
| "rewards/accuracy_reward": 0.20833334140479565, | |
| "rewards/format_reward": 0.0, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 1670.7083740234375, | |
| "epoch": 0.6319444444444444, | |
| "grad_norm": 1.095352053642273, | |
| "kl": 2.46484375, | |
| "learning_rate": 2.595443685097721e-06, | |
| "loss": 0.1527, | |
| "reward": 0.1250000037252903, | |
| "reward_std": 0.16661179810762405, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 1489.5000305175781, | |
| "epoch": 0.6354166666666666, | |
| "grad_norm": 2.259946346282959, | |
| "kl": 0.67138671875, | |
| "learning_rate": 2.589208944820415e-06, | |
| "loss": 0.1956, | |
| "reward": 0.16666666977107525, | |
| "reward_std": 0.2861081659793854, | |
| "rewards/accuracy_reward": 0.16666666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 1528.0208740234375, | |
| "epoch": 0.6388888888888888, | |
| "grad_norm": 1.399794578552246, | |
| "kl": 0.407470703125, | |
| "learning_rate": 2.5829341409317867e-06, | |
| "loss": 0.0884, | |
| "reward": 0.2291666753590107, | |
| "reward_std": 0.21764283999800682, | |
| "rewards/accuracy_reward": 0.2291666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 1513.1667175292969, | |
| "epoch": 0.6423611111111112, | |
| "grad_norm": 1.0213326215744019, | |
| "kl": 0.502685546875, | |
| "learning_rate": 2.5766195042335618e-06, | |
| "loss": 0.2003, | |
| "reward": 0.3333333469927311, | |
| "reward_std": 0.23116152733564377, | |
| "rewards/accuracy_reward": 0.3333333469927311, | |
| "rewards/format_reward": 0.0, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 1259.1875305175781, | |
| "epoch": 0.6458333333333334, | |
| "grad_norm": 1.9825993776321411, | |
| "kl": 0.530029296875, | |
| "learning_rate": 2.5702652669926085e-06, | |
| "loss": 0.3958, | |
| "reward": 0.4375000111758709, | |
| "reward_std": 0.44616056233644485, | |
| "rewards/accuracy_reward": 0.4375000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 1536.0833435058594, | |
| "epoch": 0.6493055555555556, | |
| "grad_norm": 2.610755205154419, | |
| "kl": 1.0009765625, | |
| "learning_rate": 2.5638716629323936e-06, | |
| "loss": 0.2405, | |
| "reward": 0.22916667349636555, | |
| "reward_std": 0.3170611262321472, | |
| "rewards/accuracy_reward": 0.22916667349636555, | |
| "rewards/format_reward": 0.0, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 1389.8958435058594, | |
| "epoch": 0.6527777777777778, | |
| "grad_norm": 8.625247955322266, | |
| "kl": 2.216796875, | |
| "learning_rate": 2.5574389272243876e-06, | |
| "loss": 0.1401, | |
| "reward": 0.1458333358168602, | |
| "reward_std": 0.2350771203637123, | |
| "rewards/accuracy_reward": 0.1458333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 1405.2917175292969, | |
| "epoch": 0.65625, | |
| "grad_norm": 5.041468143463135, | |
| "kl": 4.9609375, | |
| "learning_rate": 2.5509672964794102e-06, | |
| "loss": 0.4101, | |
| "reward": 0.39583334513008595, | |
| "reward_std": 0.29962683469057083, | |
| "rewards/accuracy_reward": 0.39583334513008595, | |
| "rewards/format_reward": 0.0, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 1349.0833892822266, | |
| "epoch": 0.6597222222222222, | |
| "grad_norm": 8.213921546936035, | |
| "kl": 6.62890625, | |
| "learning_rate": 2.544457008738933e-06, | |
| "loss": 0.5199, | |
| "reward": 0.2083333395421505, | |
| "reward_std": 0.23116152733564377, | |
| "rewards/accuracy_reward": 0.2083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 1245.6667175292969, | |
| "epoch": 0.6631944444444444, | |
| "grad_norm": 1184.245849609375, | |
| "kl": 21.8701171875, | |
| "learning_rate": 2.5379083034663196e-06, | |
| "loss": 2.5309, | |
| "reward": 0.4166666865348816, | |
| "reward_std": 0.350657869130373, | |
| "rewards/accuracy_reward": 0.4166666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 1279.7083435058594, | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 46.52630615234375, | |
| "kl": 1.13671875, | |
| "learning_rate": 2.5313214215380183e-06, | |
| "loss": 0.3942, | |
| "reward": 0.4166666716337204, | |
| "reward_std": 0.4056045040488243, | |
| "rewards/accuracy_reward": 0.4166666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 1142.604232788086, | |
| "epoch": 0.6701388888888888, | |
| "grad_norm": 7.017965316772461, | |
| "kl": 1.408203125, | |
| "learning_rate": 2.5246966052347035e-06, | |
| "loss": 0.3899, | |
| "reward": 0.5000000074505806, | |
| "reward_std": 0.2861081659793854, | |
| "rewards/accuracy_reward": 0.5000000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 1281.0000305175781, | |
| "epoch": 0.6736111111111112, | |
| "grad_norm": 2.2156763076782227, | |
| "kl": 3.1796875, | |
| "learning_rate": 2.518034098232363e-06, | |
| "loss": 0.198, | |
| "reward": 0.354166679084301, | |
| "reward_std": 0.2350771315395832, | |
| "rewards/accuracy_reward": 0.354166679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 1260.8958587646484, | |
| "epoch": 0.6770833333333334, | |
| "grad_norm": 3.9772746562957764, | |
| "kl": 3.33203125, | |
| "learning_rate": 2.511334145593335e-06, | |
| "loss": 0.4052, | |
| "reward": 0.35416668094694614, | |
| "reward_std": 0.309229951351881, | |
| "rewards/accuracy_reward": 0.35416668094694614, | |
| "rewards/format_reward": 0.0, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 1400.2916717529297, | |
| "epoch": 0.6805555555555556, | |
| "grad_norm": 4.802979946136475, | |
| "kl": 4.3779296875, | |
| "learning_rate": 2.5045969937572946e-06, | |
| "loss": 0.3441, | |
| "reward": 0.229166679084301, | |
| "reward_std": 0.1801304966211319, | |
| "rewards/accuracy_reward": 0.229166679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 1197.3542022705078, | |
| "epoch": 0.6840277777777778, | |
| "grad_norm": 2.4317538738250732, | |
| "kl": 2.39404296875, | |
| "learning_rate": 2.497822890532189e-06, | |
| "loss": 0.2276, | |
| "reward": 0.2291666679084301, | |
| "reward_std": 0.1705273911356926, | |
| "rewards/accuracy_reward": 0.2291666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 1415.8333740234375, | |
| "epoch": 0.6875, | |
| "grad_norm": 6.517379283905029, | |
| "kl": 1.5166015625, | |
| "learning_rate": 2.4910120850851222e-06, | |
| "loss": 0.3047, | |
| "reward": 0.2916666716337204, | |
| "reward_std": 0.31314554065465927, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 1371.8333740234375, | |
| "epoch": 0.6909722222222222, | |
| "grad_norm": 11.302325248718262, | |
| "kl": 2.34765625, | |
| "learning_rate": 2.484164827933191e-06, | |
| "loss": 0.5199, | |
| "reward": 0.3541666679084301, | |
| "reward_std": 0.44616059213876724, | |
| "rewards/accuracy_reward": 0.3541666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 938.8958435058594, | |
| "epoch": 0.6944444444444444, | |
| "grad_norm": 4.46142053604126, | |
| "kl": 2.26953125, | |
| "learning_rate": 2.477281370934269e-06, | |
| "loss": 0.4124, | |
| "reward": 0.5208333358168602, | |
| "reward_std": 0.36417657136917114, | |
| "rewards/accuracy_reward": 0.5208333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 1234.7083740234375, | |
| "epoch": 0.6979166666666666, | |
| "grad_norm": 28.580671310424805, | |
| "kl": 14.4609375, | |
| "learning_rate": 2.4703619672777437e-06, | |
| "loss": 1.0409, | |
| "reward": 0.3750000074505806, | |
| "reward_std": 0.32274864614009857, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 1200.312515258789, | |
| "epoch": 0.7013888888888888, | |
| "grad_norm": 34.20479965209961, | |
| "kl": 16.609375, | |
| "learning_rate": 2.463406871475204e-06, | |
| "loss": 1.4472, | |
| "reward": 0.5208333507180214, | |
| "reward_std": 0.45399176329374313, | |
| "rewards/accuracy_reward": 0.5208333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 1110.2291870117188, | |
| "epoch": 0.7048611111111112, | |
| "grad_norm": 13.847625732421875, | |
| "kl": 6.369071960449219, | |
| "learning_rate": 2.4564163393510773e-06, | |
| "loss": 0.556, | |
| "reward": 0.4375000111758709, | |
| "reward_std": 0.2350771129131317, | |
| "rewards/accuracy_reward": 0.4375000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 1140.9583892822266, | |
| "epoch": 0.7083333333333334, | |
| "grad_norm": 3.1892592906951904, | |
| "kl": 4.072265625, | |
| "learning_rate": 2.449390628033221e-06, | |
| "loss": 0.447, | |
| "reward": 0.416666679084301, | |
| "reward_std": 0.4701542556285858, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 1346.6458587646484, | |
| "epoch": 0.7118055555555556, | |
| "grad_norm": 7.5219502449035645, | |
| "kl": 2.79296875, | |
| "learning_rate": 2.442329995943464e-06, | |
| "loss": 0.5224, | |
| "reward": 0.3750000074505806, | |
| "reward_std": 0.4230388179421425, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 1420.5208740234375, | |
| "epoch": 0.7152777777777778, | |
| "grad_norm": 0.7026904821395874, | |
| "kl": 1.548828125, | |
| "learning_rate": 2.4352347027881005e-06, | |
| "loss": 0.2904, | |
| "reward": 0.2708333432674408, | |
| "reward_std": 0.21764282882213593, | |
| "rewards/accuracy_reward": 0.2708333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 1102.5625305175781, | |
| "epoch": 0.71875, | |
| "grad_norm": 0.971058189868927, | |
| "kl": 0.6015625, | |
| "learning_rate": 2.4281050095483407e-06, | |
| "loss": 0.1603, | |
| "reward": 0.3541666716337204, | |
| "reward_std": 0.18796169012784958, | |
| "rewards/accuracy_reward": 0.3541666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 1038.3958740234375, | |
| "epoch": 0.7222222222222222, | |
| "grad_norm": 0.8834924101829529, | |
| "kl": 0.481689453125, | |
| "learning_rate": 2.4209411784707064e-06, | |
| "loss": 0.2426, | |
| "reward": 0.5208333432674408, | |
| "reward_std": 0.1705274060368538, | |
| "rewards/accuracy_reward": 0.5208333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 1390.1042175292969, | |
| "epoch": 0.7256944444444444, | |
| "grad_norm": 0.45683708786964417, | |
| "kl": 0.99072265625, | |
| "learning_rate": 2.4137434730573906e-06, | |
| "loss": 0.1915, | |
| "reward": 0.2291666716337204, | |
| "reward_std": 0.2621145099401474, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 1307.0625305175781, | |
| "epoch": 0.7291666666666666, | |
| "grad_norm": 2.871281862258911, | |
| "kl": 1.170166015625, | |
| "learning_rate": 2.40651215805656e-06, | |
| "loss": 0.2271, | |
| "reward": 0.41666668839752674, | |
| "reward_std": 0.3131455294787884, | |
| "rewards/accuracy_reward": 0.41666668839752674, | |
| "rewards/format_reward": 0.0, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 1344.3958740234375, | |
| "epoch": 0.7326388888888888, | |
| "grad_norm": 0.6937971711158752, | |
| "kl": 0.9228515625, | |
| "learning_rate": 2.39924749945262e-06, | |
| "loss": 0.1508, | |
| "reward": 0.20833334140479565, | |
| "reward_std": 0.2686738818883896, | |
| "rewards/accuracy_reward": 0.20833334140479565, | |
| "rewards/format_reward": 0.0, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 1410.7708740234375, | |
| "epoch": 0.7361111111111112, | |
| "grad_norm": 1.747154951095581, | |
| "kl": 1.0986328125, | |
| "learning_rate": 2.39194976445643e-06, | |
| "loss": 0.2573, | |
| "reward": 0.22916667722165585, | |
| "reward_std": 0.23507710918784142, | |
| "rewards/accuracy_reward": 0.22916667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 1585.7917175292969, | |
| "epoch": 0.7395833333333334, | |
| "grad_norm": 3.6928560733795166, | |
| "kl": 1.73828125, | |
| "learning_rate": 2.3846192214954763e-06, | |
| "loss": 0.2407, | |
| "reward": 0.18750000558793545, | |
| "reward_std": 0.2446802258491516, | |
| "rewards/accuracy_reward": 0.18750000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 1223.2083740234375, | |
| "epoch": 0.7430555555555556, | |
| "grad_norm": 8.409431457519531, | |
| "kl": 2.0185546875, | |
| "learning_rate": 2.377256140203997e-06, | |
| "loss": 0.4203, | |
| "reward": 0.35416668094694614, | |
| "reward_std": 0.34674229100346565, | |
| "rewards/accuracy_reward": 0.35416668094694614, | |
| "rewards/format_reward": 0.0, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 1047.7500305175781, | |
| "epoch": 0.7465277777777778, | |
| "grad_norm": 6.448366641998291, | |
| "kl": 4.17578125, | |
| "learning_rate": 2.3698607914130642e-06, | |
| "loss": 0.6772, | |
| "reward": 0.5000000204890966, | |
| "reward_std": 0.4326418675482273, | |
| "rewards/accuracy_reward": 0.5000000204890966, | |
| "rewards/format_reward": 0.0, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 1139.7083740234375, | |
| "epoch": 0.75, | |
| "grad_norm": 15.943120002746582, | |
| "kl": 8.171875, | |
| "learning_rate": 2.3624334471406243e-06, | |
| "loss": 0.7659, | |
| "reward": 0.5000000111758709, | |
| "reward_std": 0.30354245379567146, | |
| "rewards/accuracy_reward": 0.5000000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 1072.4791717529297, | |
| "epoch": 0.7534722222222222, | |
| "grad_norm": 13.318893432617188, | |
| "kl": 6.59765625, | |
| "learning_rate": 2.3549743805814904e-06, | |
| "loss": 0.6887, | |
| "reward": 0.4375000074505806, | |
| "reward_std": 0.40168892592191696, | |
| "rewards/accuracy_reward": 0.4375000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 1184.4167022705078, | |
| "epoch": 0.7569444444444444, | |
| "grad_norm": 4.271236896514893, | |
| "kl": 3.3179931640625, | |
| "learning_rate": 2.3474838660972937e-06, | |
| "loss": 0.5353, | |
| "reward": 0.5, | |
| "reward_std": 0.4701542407274246, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 0.0, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 1368.0000305175781, | |
| "epoch": 0.7604166666666666, | |
| "grad_norm": 3.128643751144409, | |
| "kl": 2.61328125, | |
| "learning_rate": 2.3399621792063933e-06, | |
| "loss": 0.211, | |
| "reward": 0.12500000186264515, | |
| "reward_std": 0.23116152361035347, | |
| "rewards/accuracy_reward": 0.12500000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 1186.9791870117188, | |
| "epoch": 0.7638888888888888, | |
| "grad_norm": 1.1380767822265625, | |
| "kl": 0.94921875, | |
| "learning_rate": 2.3324095965737407e-06, | |
| "loss": 0.1204, | |
| "reward": 0.3750000111758709, | |
| "reward_std": 0.20412413775920868, | |
| "rewards/accuracy_reward": 0.3750000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 968.2917175292969, | |
| "epoch": 0.7673611111111112, | |
| "grad_norm": 2.683229923248291, | |
| "kl": 1.0556640625, | |
| "learning_rate": 2.3248263960007045e-06, | |
| "loss": 0.2235, | |
| "reward": 0.35416667722165585, | |
| "reward_std": 0.2350771240890026, | |
| "rewards/accuracy_reward": 0.35416667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 1312.0833740234375, | |
| "epoch": 0.7708333333333334, | |
| "grad_norm": 2.488219738006592, | |
| "kl": 2.0458984375, | |
| "learning_rate": 2.3172128564148506e-06, | |
| "loss": 0.3287, | |
| "reward": 0.43750001303851604, | |
| "reward_std": 0.27258947119116783, | |
| "rewards/accuracy_reward": 0.43750001303851604, | |
| "rewards/format_reward": 0.0, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 1525.5416870117188, | |
| "epoch": 0.7743055555555556, | |
| "grad_norm": 5.634335041046143, | |
| "kl": 2.66015625, | |
| "learning_rate": 2.3095692578596847e-06, | |
| "loss": 0.3212, | |
| "reward": 0.1666666716337204, | |
| "reward_std": 0.19364918768405914, | |
| "rewards/accuracy_reward": 0.1666666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 1262.6666870117188, | |
| "epoch": 0.7777777777777778, | |
| "grad_norm": 3.449770927429199, | |
| "kl": 1.732421875, | |
| "learning_rate": 2.3018958814843497e-06, | |
| "loss": 0.3507, | |
| "reward": 0.47916666977107525, | |
| "reward_std": 0.299626849591732, | |
| "rewards/accuracy_reward": 0.47916666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 1476.1875305175781, | |
| "epoch": 0.78125, | |
| "grad_norm": 2.946892261505127, | |
| "kl": 2.72265625, | |
| "learning_rate": 2.2941930095332855e-06, | |
| "loss": 0.4074, | |
| "reward": 0.3333333395421505, | |
| "reward_std": 0.3332235924899578, | |
| "rewards/accuracy_reward": 0.3333333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 1200.250015258789, | |
| "epoch": 0.7847222222222222, | |
| "grad_norm": 1.4603437185287476, | |
| "kl": 2.17529296875, | |
| "learning_rate": 2.286460925335848e-06, | |
| "loss": 0.2996, | |
| "reward": 0.47916667722165585, | |
| "reward_std": 0.33713917806744576, | |
| "rewards/accuracy_reward": 0.47916667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 1478.4792175292969, | |
| "epoch": 0.7881944444444444, | |
| "grad_norm": 30.39830780029297, | |
| "kl": 5.078125, | |
| "learning_rate": 2.278699913295884e-06, | |
| "loss": 0.439, | |
| "reward": 0.3541666679084301, | |
| "reward_std": 0.2350771203637123, | |
| "rewards/accuracy_reward": 0.3541666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 1068.0833435058594, | |
| "epoch": 0.7916666666666666, | |
| "grad_norm": 41.49460983276367, | |
| "kl": 3.273681640625, | |
| "learning_rate": 2.270910258881277e-06, | |
| "loss": 0.2238, | |
| "reward": 0.3958333469927311, | |
| "reward_std": 0.28219256177544594, | |
| "rewards/accuracy_reward": 0.3958333469927311, | |
| "rewards/format_reward": 0.0, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 1184.9792175292969, | |
| "epoch": 0.7951388888888888, | |
| "grad_norm": 54.158878326416016, | |
| "kl": 6.6171875, | |
| "learning_rate": 2.26309224861344e-06, | |
| "loss": 0.5627, | |
| "reward": 0.27083333767950535, | |
| "reward_std": 0.3170611374080181, | |
| "rewards/accuracy_reward": 0.27083333767950535, | |
| "rewards/format_reward": 0.0, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 1613.9583740234375, | |
| "epoch": 0.7986111111111112, | |
| "grad_norm": 25.57980728149414, | |
| "kl": 8.19921875, | |
| "learning_rate": 2.25524617005678e-06, | |
| "loss": 0.5767, | |
| "reward": 0.20833333767950535, | |
| "reward_std": 0.24859581887722015, | |
| "rewards/accuracy_reward": 0.20833333767950535, | |
| "rewards/format_reward": 0.0, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 1398.5417175292969, | |
| "epoch": 0.8020833333333334, | |
| "grad_norm": 30.61398696899414, | |
| "kl": 11.1302490234375, | |
| "learning_rate": 2.247372311808121e-06, | |
| "loss": 0.6159, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 1408.875015258789, | |
| "epoch": 0.8055555555555556, | |
| "grad_norm": 14.65215015411377, | |
| "kl": 10.6015625, | |
| "learning_rate": 2.2394709634860874e-06, | |
| "loss": 0.8554, | |
| "reward": 0.3750000074505806, | |
| "reward_std": 0.32097674161195755, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 1189.0833740234375, | |
| "epoch": 0.8090277777777778, | |
| "grad_norm": 3.0363640785217285, | |
| "kl": 2.8212890625, | |
| "learning_rate": 2.231542415720452e-06, | |
| "loss": 0.3739, | |
| "reward": 0.37500000558793545, | |
| "reward_std": 0.23899271339178085, | |
| "rewards/accuracy_reward": 0.37500000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 869.3750305175781, | |
| "epoch": 0.8125, | |
| "grad_norm": 10.15754508972168, | |
| "kl": 1.951171875, | |
| "learning_rate": 2.2235869601414455e-06, | |
| "loss": 0.4962, | |
| "reward": 0.5625000298023224, | |
| "reward_std": 0.41912319883704185, | |
| "rewards/accuracy_reward": 0.5625000298023224, | |
| "rewards/format_reward": 0.0, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 1577.8333740234375, | |
| "epoch": 0.8159722222222222, | |
| "grad_norm": 5.449507236480713, | |
| "kl": 4.6640625, | |
| "learning_rate": 2.2156048893690305e-06, | |
| "loss": 0.4746, | |
| "reward": 0.27083333767950535, | |
| "reward_std": 0.37465155124664307, | |
| "rewards/accuracy_reward": 0.27083333767950535, | |
| "rewards/format_reward": 0.0, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 1204.2292022705078, | |
| "epoch": 0.8194444444444444, | |
| "grad_norm": 6.545111656188965, | |
| "kl": 2.326171875, | |
| "learning_rate": 2.207596497002137e-06, | |
| "loss": 0.313, | |
| "reward": 0.3750000149011612, | |
| "reward_std": 0.2861081548035145, | |
| "rewards/accuracy_reward": 0.3750000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 1345.2292175292969, | |
| "epoch": 0.8229166666666666, | |
| "grad_norm": 9.42117691040039, | |
| "kl": 4.671875, | |
| "learning_rate": 2.1995620776078635e-06, | |
| "loss": 0.5092, | |
| "reward": 0.3333333358168602, | |
| "reward_std": 0.4056045264005661, | |
| "rewards/accuracy_reward": 0.3333333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 949.4166870117188, | |
| "epoch": 0.8263888888888888, | |
| "grad_norm": 2.940758466720581, | |
| "kl": 3.52734375, | |
| "learning_rate": 2.1915019267106434e-06, | |
| "loss": 0.5326, | |
| "reward": 0.520833358168602, | |
| "reward_std": 0.40168892219662666, | |
| "rewards/accuracy_reward": 0.520833358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 1049.1458740234375, | |
| "epoch": 0.8298611111111112, | |
| "grad_norm": 1.9600216150283813, | |
| "kl": 4.29327392578125, | |
| "learning_rate": 2.1834163407813736e-06, | |
| "loss": 0.5532, | |
| "reward": 0.5208333432674408, | |
| "reward_std": 0.3170611336827278, | |
| "rewards/accuracy_reward": 0.5208333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 1549.7500305175781, | |
| "epoch": 0.8333333333333334, | |
| "grad_norm": 49.90202331542969, | |
| "kl": 4.96484375, | |
| "learning_rate": 2.17530561722651e-06, | |
| "loss": 0.4062, | |
| "reward": 0.1458333395421505, | |
| "reward_std": 0.21764283627271652, | |
| "rewards/accuracy_reward": 0.1458333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 1312.3542175292969, | |
| "epoch": 0.8368055555555556, | |
| "grad_norm": 8.244383811950684, | |
| "kl": 7.55859375, | |
| "learning_rate": 2.167170054377128e-06, | |
| "loss": 0.6962, | |
| "reward": 0.3125000149011612, | |
| "reward_std": 0.36417658627033234, | |
| "rewards/accuracy_reward": 0.3125000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 1401.9167022705078, | |
| "epoch": 0.8402777777777778, | |
| "grad_norm": 9.167287826538086, | |
| "kl": 7.69140625, | |
| "learning_rate": 2.15900995147795e-06, | |
| "loss": 0.5959, | |
| "reward": 0.33333334885537624, | |
| "reward_std": 0.30354243144392967, | |
| "rewards/accuracy_reward": 0.33333334885537624, | |
| "rewards/format_reward": 0.0, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 1388.9583740234375, | |
| "epoch": 0.84375, | |
| "grad_norm": 30.333803176879883, | |
| "kl": 3.76171875, | |
| "learning_rate": 2.150825608676337e-06, | |
| "loss": 0.4013, | |
| "reward": 0.3333333432674408, | |
| "reward_std": 0.2957112528383732, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 1112.5833740234375, | |
| "epoch": 0.8472222222222222, | |
| "grad_norm": 21.359888076782227, | |
| "kl": 0.76806640625, | |
| "learning_rate": 2.14261732701125e-06, | |
| "loss": 0.2214, | |
| "reward": 0.5000000149011612, | |
| "reward_std": 0.2861081548035145, | |
| "rewards/accuracy_reward": 0.5000000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 1473.6458740234375, | |
| "epoch": 0.8506944444444444, | |
| "grad_norm": 91.08158874511719, | |
| "kl": 6.193359375, | |
| "learning_rate": 2.1343854084021774e-06, | |
| "loss": 0.3037, | |
| "reward": 0.2708333432674408, | |
| "reward_std": 0.05103103816509247, | |
| "rewards/accuracy_reward": 0.2708333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 1369.3542175292969, | |
| "epoch": 0.8541666666666666, | |
| "grad_norm": 23.554519653320312, | |
| "kl": 1.7275390625, | |
| "learning_rate": 2.126130155638026e-06, | |
| "loss": 0.3577, | |
| "reward": 0.3125000074505806, | |
| "reward_std": 0.36417657509446144, | |
| "rewards/accuracy_reward": 0.3125000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 1561.1458740234375, | |
| "epoch": 0.8576388888888888, | |
| "grad_norm": 39.059391021728516, | |
| "kl": 3.13671875, | |
| "learning_rate": 2.1178518723659894e-06, | |
| "loss": 0.3911, | |
| "reward": 0.3125000074505806, | |
| "reward_std": 0.30922994762659073, | |
| "rewards/accuracy_reward": 0.3125000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 1173.2083587646484, | |
| "epoch": 0.8611111111111112, | |
| "grad_norm": 37.057369232177734, | |
| "kl": 3.388671875, | |
| "learning_rate": 2.1095508630803744e-06, | |
| "loss": 0.4934, | |
| "reward": 0.33333334513008595, | |
| "reward_std": 0.2861081697046757, | |
| "rewards/accuracy_reward": 0.33333334513008595, | |
| "rewards/format_reward": 0.0, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 1510.7083740234375, | |
| "epoch": 0.8645833333333334, | |
| "grad_norm": 29.06678581237793, | |
| "kl": 8.685546875, | |
| "learning_rate": 2.1012274331114045e-06, | |
| "loss": 0.6475, | |
| "reward": 0.2083333395421505, | |
| "reward_std": 0.23116152733564377, | |
| "rewards/accuracy_reward": 0.2083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 1136.687515258789, | |
| "epoch": 0.8680555555555556, | |
| "grad_norm": 8.243437767028809, | |
| "kl": 8.0859375, | |
| "learning_rate": 2.0928818886139857e-06, | |
| "loss": 0.4485, | |
| "reward": 0.3958333432674408, | |
| "reward_std": 0.35457346588373184, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 1320.9166717529297, | |
| "epoch": 0.8715277777777778, | |
| "grad_norm": 4.048635959625244, | |
| "kl": 2.470703125, | |
| "learning_rate": 2.0845145365564493e-06, | |
| "loss": 0.3813, | |
| "reward": 0.3333333432674408, | |
| "reward_std": 0.3776952549815178, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 1191.6250610351562, | |
| "epoch": 0.875, | |
| "grad_norm": 93.0502700805664, | |
| "kl": 39.8759765625, | |
| "learning_rate": 2.0761256847092566e-06, | |
| "loss": 2.2735, | |
| "reward": 0.520833358168602, | |
| "reward_std": 0.23507710546255112, | |
| "rewards/accuracy_reward": 0.520833358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 1258.0208740234375, | |
| "epoch": 0.8784722222222222, | |
| "grad_norm": 5.269865036010742, | |
| "kl": 1.8564453125, | |
| "learning_rate": 2.0677156416336823e-06, | |
| "loss": 0.45, | |
| "reward": 0.4375000149011612, | |
| "reward_std": 0.36417655646800995, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 1305.8333740234375, | |
| "epoch": 0.8819444444444444, | |
| "grad_norm": 2.876171112060547, | |
| "kl": 2.21728515625, | |
| "learning_rate": 2.059284716670463e-06, | |
| "loss": 0.3218, | |
| "reward": 0.37500000558793545, | |
| "reward_std": 0.23899272084236145, | |
| "rewards/accuracy_reward": 0.37500000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 1217.6667175292969, | |
| "epoch": 0.8854166666666666, | |
| "grad_norm": 8.743562698364258, | |
| "kl": 3.15234375, | |
| "learning_rate": 2.0508332199284182e-06, | |
| "loss": 0.5148, | |
| "reward": 0.479166679084301, | |
| "reward_std": 0.48367295414209366, | |
| "rewards/accuracy_reward": 0.479166679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 1673.6667175292969, | |
| "epoch": 0.8888888888888888, | |
| "grad_norm": 14.936572074890137, | |
| "kl": 10.09375, | |
| "learning_rate": 2.0423614622730465e-06, | |
| "loss": 0.6841, | |
| "reward": 0.20833333767950535, | |
| "reward_std": 0.3977733291685581, | |
| "rewards/accuracy_reward": 0.20833333767950535, | |
| "rewards/format_reward": 0.0, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 1166.6042022705078, | |
| "epoch": 0.8923611111111112, | |
| "grad_norm": 6.621070384979248, | |
| "kl": 4.904296875, | |
| "learning_rate": 2.033869755315087e-06, | |
| "loss": 0.3867, | |
| "reward": 0.37500000558793545, | |
| "reward_std": 0.24859581515192986, | |
| "rewards/accuracy_reward": 0.37500000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 1206.8750610351562, | |
| "epoch": 0.8958333333333334, | |
| "grad_norm": 5.69266414642334, | |
| "kl": 4.007080078125, | |
| "learning_rate": 2.025358411399063e-06, | |
| "loss": 0.3224, | |
| "reward": 0.3541666828095913, | |
| "reward_std": 0.27258947119116783, | |
| "rewards/accuracy_reward": 0.3541666828095913, | |
| "rewards/format_reward": 0.0, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 1439.0208740234375, | |
| "epoch": 0.8993055555555556, | |
| "grad_norm": 3.2223267555236816, | |
| "kl": 2.7587890625, | |
| "learning_rate": 2.016827743591788e-06, | |
| "loss": 0.2154, | |
| "reward": 0.3125000074505806, | |
| "reward_std": 0.317061148583889, | |
| "rewards/accuracy_reward": 0.3125000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 1229.6250457763672, | |
| "epoch": 0.9027777777777778, | |
| "grad_norm": 20.625864028930664, | |
| "kl": 1.75, | |
| "learning_rate": 2.0082780656708533e-06, | |
| "loss": 0.3499, | |
| "reward": 0.3750000074505806, | |
| "reward_std": 0.377695269882679, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 1381.8750610351562, | |
| "epoch": 0.90625, | |
| "grad_norm": 29.840988159179688, | |
| "kl": 1.0322265625, | |
| "learning_rate": 1.999709692113086e-06, | |
| "loss": 0.3313, | |
| "reward": 0.5000000223517418, | |
| "reward_std": 0.4326418936252594, | |
| "rewards/accuracy_reward": 0.5000000223517418, | |
| "rewards/format_reward": 0.0, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 1118.0208587646484, | |
| "epoch": 0.9097222222222222, | |
| "grad_norm": 7.191032886505127, | |
| "kl": 1.302734375, | |
| "learning_rate": 1.9911229380829837e-06, | |
| "loss": 0.2845, | |
| "reward": 0.4375000186264515, | |
| "reward_std": 0.37377967685461044, | |
| "rewards/accuracy_reward": 0.4375000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 1538.5208740234375, | |
| "epoch": 0.9131944444444444, | |
| "grad_norm": 7.132680892944336, | |
| "kl": 4.359375, | |
| "learning_rate": 1.9825181194211162e-06, | |
| "loss": 0.3229, | |
| "reward": 0.25, | |
| "reward_std": 0.12909945845603943, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 0.0, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 1527.3750610351562, | |
| "epoch": 0.9166666666666666, | |
| "grad_norm": 3.0292775630950928, | |
| "kl": 2.9296875, | |
| "learning_rate": 1.973895552632514e-06, | |
| "loss": 0.3026, | |
| "reward": 0.22916667349636555, | |
| "reward_std": 0.28219256550073624, | |
| "rewards/accuracy_reward": 0.22916667349636555, | |
| "rewards/format_reward": 0.0, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 1328.0625305175781, | |
| "epoch": 0.9201388888888888, | |
| "grad_norm": 4.47227668762207, | |
| "kl": 2.919921875, | |
| "learning_rate": 1.9652555548750265e-06, | |
| "loss": 0.3454, | |
| "reward": 0.2916666753590107, | |
| "reward_std": 0.3410548120737076, | |
| "rewards/accuracy_reward": 0.2916666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 1143.5625457763672, | |
| "epoch": 0.9236111111111112, | |
| "grad_norm": 9.752375602722168, | |
| "kl": 4.634765625, | |
| "learning_rate": 1.95659844394765e-06, | |
| "loss": 0.3936, | |
| "reward": 0.3958333358168602, | |
| "reward_std": 0.38161086291074753, | |
| "rewards/accuracy_reward": 0.3958333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 1455.8958740234375, | |
| "epoch": 0.9270833333333334, | |
| "grad_norm": 3.272082567214966, | |
| "kl": 5.5, | |
| "learning_rate": 1.947924538278847e-06, | |
| "loss": 0.5054, | |
| "reward": 0.291666679084301, | |
| "reward_std": 0.395129531621933, | |
| "rewards/accuracy_reward": 0.291666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 1359.6458587646484, | |
| "epoch": 0.9305555555555556, | |
| "grad_norm": 5.1712446212768555, | |
| "kl": 3.232421875, | |
| "learning_rate": 1.9392341569148255e-06, | |
| "loss": 0.3484, | |
| "reward": 0.25000000186264515, | |
| "reward_std": 0.2686738707125187, | |
| "rewards/accuracy_reward": 0.25000000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 1336.4167175292969, | |
| "epoch": 0.9340277777777778, | |
| "grad_norm": 1.3874348402023315, | |
| "kl": 4.48828125, | |
| "learning_rate": 1.9305276195078106e-06, | |
| "loss": 0.4778, | |
| "reward": 0.3958333432674408, | |
| "reward_std": 0.2621144950389862, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 1308.3333740234375, | |
| "epoch": 0.9375, | |
| "grad_norm": 9.72807788848877, | |
| "kl": 6.0703125, | |
| "learning_rate": 1.921805246304281e-06, | |
| "loss": 0.4963, | |
| "reward": 0.2500000074505806, | |
| "reward_std": 0.24859581142663956, | |
| "rewards/accuracy_reward": 0.2500000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 1447.5625610351562, | |
| "epoch": 0.9409722222222222, | |
| "grad_norm": 8.926982879638672, | |
| "kl": 6.046875, | |
| "learning_rate": 1.913067358133195e-06, | |
| "loss": 0.4169, | |
| "reward": 0.1666666716337204, | |
| "reward_std": 0.20148035883903503, | |
| "rewards/accuracy_reward": 0.1666666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 1563.000015258789, | |
| "epoch": 0.9444444444444444, | |
| "grad_norm": 6.924929141998291, | |
| "kl": 4.8828125, | |
| "learning_rate": 1.904314276394185e-06, | |
| "loss": 0.3244, | |
| "reward": 0.14583333395421505, | |
| "reward_std": 0.1801304966211319, | |
| "rewards/accuracy_reward": 0.14583333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 1408.2708435058594, | |
| "epoch": 0.9479166666666666, | |
| "grad_norm": 3.0602502822875977, | |
| "kl": 2.9296875, | |
| "learning_rate": 1.895546323045739e-06, | |
| "loss": 0.4048, | |
| "reward": 0.291666679084301, | |
| "reward_std": 0.32097672671079636, | |
| "rewards/accuracy_reward": 0.291666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 1077.812515258789, | |
| "epoch": 0.9513888888888888, | |
| "grad_norm": 4.916645050048828, | |
| "kl": 1.953125, | |
| "learning_rate": 1.8867638205933569e-06, | |
| "loss": 0.233, | |
| "reward": 0.3750000074505806, | |
| "reward_std": 0.3977733254432678, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 1295.2083740234375, | |
| "epoch": 0.9548611111111112, | |
| "grad_norm": 4.2872538566589355, | |
| "kl": 1.2802734375, | |
| "learning_rate": 1.8779670920776879e-06, | |
| "loss": 0.2287, | |
| "reward": 0.25, | |
| "reward_std": 0.31314554065465927, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 0.0, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 1236.3750457763672, | |
| "epoch": 0.9583333333333334, | |
| "grad_norm": 7.937647342681885, | |
| "kl": 1.951171875, | |
| "learning_rate": 1.8691564610626482e-06, | |
| "loss": 0.4362, | |
| "reward": 0.4583333544433117, | |
| "reward_std": 0.4248107075691223, | |
| "rewards/accuracy_reward": 0.4583333544433117, | |
| "rewards/format_reward": 0.0, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 1691.3333435058594, | |
| "epoch": 0.9618055555555556, | |
| "grad_norm": 1.483902096748352, | |
| "kl": 2.9921875, | |
| "learning_rate": 1.8603322516235209e-06, | |
| "loss": 0.328, | |
| "reward": 0.2291666753590107, | |
| "reward_std": 0.2446802258491516, | |
| "rewards/accuracy_reward": 0.2291666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 1274.604248046875, | |
| "epoch": 0.9652777777777778, | |
| "grad_norm": 4.791306018829346, | |
| "kl": 2.279296875, | |
| "learning_rate": 1.8514947883350336e-06, | |
| "loss": 0.311, | |
| "reward": 0.37500002048909664, | |
| "reward_std": 0.2861081510782242, | |
| "rewards/accuracy_reward": 0.37500002048909664, | |
| "rewards/format_reward": 0.0, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 1250.8958435058594, | |
| "epoch": 0.96875, | |
| "grad_norm": 1.8412401676177979, | |
| "kl": 4.4765625, | |
| "learning_rate": 1.842644396259422e-06, | |
| "loss": 0.4648, | |
| "reward": 0.3125000074505806, | |
| "reward_std": 0.2525114044547081, | |
| "rewards/accuracy_reward": 0.3125000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 1567.4166870117188, | |
| "epoch": 0.9722222222222222, | |
| "grad_norm": 4.421205043792725, | |
| "kl": 7.1640625, | |
| "learning_rate": 1.8337814009344715e-06, | |
| "loss": 0.6057, | |
| "reward": 0.2083333358168602, | |
| "reward_std": 0.2957112565636635, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 1520.9166870117188, | |
| "epoch": 0.9756944444444444, | |
| "grad_norm": 40.64996337890625, | |
| "kl": 9.2421875, | |
| "learning_rate": 1.824906128361545e-06, | |
| "loss": 0.7715, | |
| "reward": 0.25000000558793545, | |
| "reward_std": 0.4152075909078121, | |
| "rewards/accuracy_reward": 0.25000000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 1281.2500457763672, | |
| "epoch": 0.9791666666666666, | |
| "grad_norm": 8.213888168334961, | |
| "kl": 6.15625, | |
| "learning_rate": 1.8160189049935894e-06, | |
| "loss": 0.4568, | |
| "reward": 0.291666679084301, | |
| "reward_std": 0.23116153106093407, | |
| "rewards/accuracy_reward": 0.291666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 1035.2083587646484, | |
| "epoch": 0.9826388888888888, | |
| "grad_norm": 2.28889536857605, | |
| "kl": 2.1376953125, | |
| "learning_rate": 1.807120057723131e-06, | |
| "loss": 0.3556, | |
| "reward": 0.5208333507180214, | |
| "reward_std": 0.3720077723264694, | |
| "rewards/accuracy_reward": 0.5208333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 1216.7708740234375, | |
| "epoch": 0.9861111111111112, | |
| "grad_norm": 3.839035987854004, | |
| "kl": 2.34765625, | |
| "learning_rate": 1.7982099138702485e-06, | |
| "loss": 0.3082, | |
| "reward": 0.3125000111758709, | |
| "reward_std": 0.27258947864174843, | |
| "rewards/accuracy_reward": 0.3125000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 1406.0625610351562, | |
| "epoch": 0.9895833333333334, | |
| "grad_norm": 0.8240854740142822, | |
| "kl": 2.9296875, | |
| "learning_rate": 1.789288801170536e-06, | |
| "loss": 0.2752, | |
| "reward": 0.3541666828095913, | |
| "reward_std": 0.28219256177544594, | |
| "rewards/accuracy_reward": 0.3541666828095913, | |
| "rewards/format_reward": 0.0, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 1407.5208740234375, | |
| "epoch": 0.9930555555555556, | |
| "grad_norm": 4.502129077911377, | |
| "kl": 3.7421875, | |
| "learning_rate": 1.7803570477630467e-06, | |
| "loss": 0.3368, | |
| "reward": 0.1666666716337204, | |
| "reward_std": 0.23899273574352264, | |
| "rewards/accuracy_reward": 0.1666666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 1468.7292175292969, | |
| "epoch": 0.9965277777777778, | |
| "grad_norm": 2.8554723262786865, | |
| "kl": 3.60546875, | |
| "learning_rate": 1.7714149821782227e-06, | |
| "loss": 0.4355, | |
| "reward": 0.3333333395421505, | |
| "reward_std": 0.2861081548035145, | |
| "rewards/accuracy_reward": 0.3333333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 287 | |
| }, | |
| { | |
| "completion_length": 1393.6458740234375, | |
| "epoch": 1.0, | |
| "grad_norm": 5.764129161834717, | |
| "kl": 2.7734375, | |
| "learning_rate": 1.762462933325813e-06, | |
| "loss": 0.4379, | |
| "reward": 0.3958333432674408, | |
| "reward_std": 0.36417658627033234, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_completion_length": 1429.0366698665562, | |
| "eval_kl": 4.87083458108124, | |
| "eval_loss": 0.4219720959663391, | |
| "eval_reward": 0.25629406557432477, | |
| "eval_reward_std": 0.2712749954751971, | |
| "eval_rewards/accuracy_reward": 0.25629406557432477, | |
| "eval_rewards/format_reward": 0.0, | |
| "eval_runtime": 18857.5739, | |
| "eval_samples_per_second": 0.07, | |
| "eval_steps_per_second": 0.006, | |
| "step": 288 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 576, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |