| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 161, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 77.421875, | |
| "epoch": 0.006211180124223602, | |
| "grad_norm": 4.007043838500977, | |
| "kl": 0.0, | |
| "learning_rate": 9.937888198757763e-07, | |
| "loss": -0.0, | |
| "reward": 1.46875, | |
| "reward_mean": 1.46875, | |
| "reward_std": 0.23356688022613525, | |
| "rewards/accuracy_reward": 0.46875, | |
| "rewards/format_reward": 1.0, | |
| "step": 1 | |
| }, | |
| { | |
| "advantages": 1.30385160446167e-08, | |
| "completion_length": 78.921875, | |
| "epoch": 0.012422360248447204, | |
| "grad_norm": 7.15122652053833, | |
| "kl": 0.00041961669921875, | |
| "learning_rate": 9.875776397515528e-07, | |
| "loss": 0.0, | |
| "reward": 1.53125, | |
| "reward_mean": 1.53125, | |
| "reward_std": 0.2845909595489502, | |
| "rewards/accuracy_reward": 0.53125, | |
| "rewards/format_reward": 1.0, | |
| "step": 2 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 78.671875, | |
| "epoch": 0.018633540372670808, | |
| "grad_norm": 2.9171459674835205, | |
| "kl": 0.000392913818359375, | |
| "learning_rate": 9.813664596273291e-07, | |
| "loss": 0.0, | |
| "reward": 1.6875, | |
| "reward_mean": 1.6875, | |
| "reward_std": 0.1552036553621292, | |
| "rewards/accuracy_reward": 0.6875, | |
| "rewards/format_reward": 1.0, | |
| "step": 3 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-09, | |
| "completion_length": 77.78125, | |
| "epoch": 0.024844720496894408, | |
| "grad_norm": 5.474589824676514, | |
| "kl": 0.000759124755859375, | |
| "learning_rate": 9.751552795031055e-07, | |
| "loss": 0.0001, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.213067427277565, | |
| "rewards/accuracy_reward": 0.71875, | |
| "rewards/format_reward": 1.0, | |
| "step": 4 | |
| }, | |
| { | |
| "advantages": -2.7939677238464355e-09, | |
| "completion_length": 90.8125, | |
| "epoch": 0.031055900621118012, | |
| "grad_norm": 5.2480363845825195, | |
| "kl": 0.001495361328125, | |
| "learning_rate": 9.68944099378882e-07, | |
| "loss": 0.0001, | |
| "reward": 1.53125, | |
| "reward_mean": 1.53125, | |
| "reward_std": 0.17570313811302185, | |
| "rewards/accuracy_reward": 0.53125, | |
| "rewards/format_reward": 1.0, | |
| "step": 5 | |
| }, | |
| { | |
| "advantages": 6.51925802230835e-09, | |
| "completion_length": 81.203125, | |
| "epoch": 0.037267080745341616, | |
| "grad_norm": 8.329508781433105, | |
| "kl": 0.006256103515625, | |
| "learning_rate": 9.627329192546583e-07, | |
| "loss": 0.0006, | |
| "reward": 1.375, | |
| "reward_mean": 1.375, | |
| "reward_std": 0.26409146189689636, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 1.0, | |
| "step": 6 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 84.265625, | |
| "epoch": 0.043478260869565216, | |
| "grad_norm": 9.400680541992188, | |
| "kl": 0.0106201171875, | |
| "learning_rate": 9.565217391304349e-07, | |
| "loss": 0.0011, | |
| "reward": 1.65625, | |
| "reward_mean": 1.65625, | |
| "reward_std": 0.2404065877199173, | |
| "rewards/accuracy_reward": 0.65625, | |
| "rewards/format_reward": 1.0, | |
| "step": 7 | |
| }, | |
| { | |
| "advantages": -2.7939677238464355e-09, | |
| "completion_length": 84.109375, | |
| "epoch": 0.049689440993788817, | |
| "grad_norm": 6.011223316192627, | |
| "kl": 0.0057373046875, | |
| "learning_rate": 9.503105590062112e-07, | |
| "loss": 0.0006, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.22461533546447754, | |
| "rewards/accuracy_reward": 0.453125, | |
| "rewards/format_reward": 0.984375, | |
| "step": 8 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 87.046875, | |
| "epoch": 0.055900621118012424, | |
| "grad_norm": 4.103212356567383, | |
| "kl": 0.00244140625, | |
| "learning_rate": 9.440993788819875e-07, | |
| "loss": 0.0002, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.0578637570142746, | |
| "rewards/accuracy_reward": 0.71875, | |
| "rewards/format_reward": 1.0, | |
| "step": 9 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 76.15625, | |
| "epoch": 0.062111801242236024, | |
| "grad_norm": 7.4132466316223145, | |
| "kl": 0.01287841796875, | |
| "learning_rate": 9.37888198757764e-07, | |
| "loss": 0.0013, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.3335031569004059, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 1.0, | |
| "step": 10 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 76.75, | |
| "epoch": 0.06832298136645963, | |
| "grad_norm": 4.6751017570495605, | |
| "kl": 0.0130615234375, | |
| "learning_rate": 9.316770186335403e-07, | |
| "loss": 0.0013, | |
| "reward": 1.28125, | |
| "reward_mean": 1.28125, | |
| "reward_std": 0.0578637570142746, | |
| "rewards/accuracy_reward": 0.28125, | |
| "rewards/format_reward": 1.0, | |
| "step": 11 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-09, | |
| "completion_length": 76.390625, | |
| "epoch": 0.07453416149068323, | |
| "grad_norm": 7.682182788848877, | |
| "kl": 0.018310546875, | |
| "learning_rate": 9.254658385093167e-07, | |
| "loss": 0.0018, | |
| "reward": 1.734375, | |
| "reward_mean": 1.734375, | |
| "reward_std": 0.15992169082164764, | |
| "rewards/accuracy_reward": 0.734375, | |
| "rewards/format_reward": 1.0, | |
| "step": 12 | |
| }, | |
| { | |
| "advantages": -2.7939677238464355e-09, | |
| "completion_length": 85.234375, | |
| "epoch": 0.08074534161490683, | |
| "grad_norm": 4.814305305480957, | |
| "kl": 0.00396728515625, | |
| "learning_rate": 9.19254658385093e-07, | |
| "loss": 0.0004, | |
| "reward": 1.8125, | |
| "reward_mean": 1.8125, | |
| "reward_std": 0.22461533546447754, | |
| "rewards/accuracy_reward": 0.8125, | |
| "rewards/format_reward": 1.0, | |
| "step": 13 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-09, | |
| "completion_length": 86.3125, | |
| "epoch": 0.08695652173913043, | |
| "grad_norm": 189.3062744140625, | |
| "kl": 0.033203125, | |
| "learning_rate": 9.130434782608695e-07, | |
| "loss": 0.0033, | |
| "reward": 1.609375, | |
| "reward_mean": 1.609375, | |
| "reward_std": 0.19044628739356995, | |
| "rewards/accuracy_reward": 0.609375, | |
| "rewards/format_reward": 1.0, | |
| "step": 14 | |
| }, | |
| { | |
| "advantages": 2.7939677238464355e-09, | |
| "completion_length": 69.8125, | |
| "epoch": 0.09316770186335403, | |
| "grad_norm": 3.5520412921905518, | |
| "kl": 0.01129150390625, | |
| "learning_rate": 9.06832298136646e-07, | |
| "loss": 0.0011, | |
| "reward": 1.78125, | |
| "reward_mean": 1.78125, | |
| "reward_std": 0.10888782143592834, | |
| "rewards/accuracy_reward": 0.78125, | |
| "rewards/format_reward": 1.0, | |
| "step": 15 | |
| }, | |
| { | |
| "advantages": -9.313225746154785e-10, | |
| "completion_length": 76.296875, | |
| "epoch": 0.09937888198757763, | |
| "grad_norm": 3.526542901992798, | |
| "kl": 0.00909423828125, | |
| "learning_rate": 9.006211180124223e-07, | |
| "loss": 0.0009, | |
| "reward": 1.609375, | |
| "reward_mean": 1.609375, | |
| "reward_std": 0.12255740165710449, | |
| "rewards/accuracy_reward": 0.609375, | |
| "rewards/format_reward": 1.0, | |
| "step": 16 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-09, | |
| "completion_length": 86.3125, | |
| "epoch": 0.10559006211180125, | |
| "grad_norm": 5.98048210144043, | |
| "kl": 0.0057373046875, | |
| "learning_rate": 8.944099378881988e-07, | |
| "loss": 0.0006, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.2041158676147461, | |
| "rewards/accuracy_reward": 0.71875, | |
| "rewards/format_reward": 1.0, | |
| "step": 17 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 82.921875, | |
| "epoch": 0.11180124223602485, | |
| "grad_norm": 2.0705599784851074, | |
| "kl": 0.00592041015625, | |
| "learning_rate": 8.881987577639751e-07, | |
| "loss": 0.0006, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.06681530922651291, | |
| "rewards/accuracy_reward": 0.578125, | |
| "rewards/format_reward": 0.984375, | |
| "step": 18 | |
| }, | |
| { | |
| "advantages": -2.7939677238464355e-09, | |
| "completion_length": 81.40625, | |
| "epoch": 0.11801242236024845, | |
| "grad_norm": 9.266715049743652, | |
| "kl": 0.0079345703125, | |
| "learning_rate": 8.819875776397515e-07, | |
| "loss": 0.0008, | |
| "reward": 1.546875, | |
| "reward_mean": 1.546875, | |
| "reward_std": 0.2109457552433014, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 0.984375, | |
| "step": 19 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 80.203125, | |
| "epoch": 0.12422360248447205, | |
| "grad_norm": 10.367863655090332, | |
| "kl": 0.0072021484375, | |
| "learning_rate": 8.757763975155279e-07, | |
| "loss": 0.0007, | |
| "reward": 1.40625, | |
| "reward_mean": 1.40625, | |
| "reward_std": 0.2404065728187561, | |
| "rewards/accuracy_reward": 0.40625, | |
| "rewards/format_reward": 1.0, | |
| "step": 20 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-09, | |
| "completion_length": 75.734375, | |
| "epoch": 0.13043478260869565, | |
| "grad_norm": 2.6553478240966797, | |
| "kl": 0.00592041015625, | |
| "learning_rate": 8.695652173913043e-07, | |
| "loss": 0.0006, | |
| "reward": 1.578125, | |
| "reward_mean": 1.578125, | |
| "reward_std": 0.10205793380737305, | |
| "rewards/accuracy_reward": 0.578125, | |
| "rewards/format_reward": 1.0, | |
| "step": 21 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 85.8125, | |
| "epoch": 0.13664596273291926, | |
| "grad_norm": 3.458266496658325, | |
| "kl": 0.00604248046875, | |
| "learning_rate": 8.633540372670807e-07, | |
| "loss": 0.0006, | |
| "reward": 1.515625, | |
| "reward_mean": 1.515625, | |
| "reward_std": 0.15981829166412354, | |
| "rewards/accuracy_reward": 0.53125, | |
| "rewards/format_reward": 0.984375, | |
| "step": 22 | |
| }, | |
| { | |
| "advantages": -2.7939677238464355e-09, | |
| "completion_length": 79.328125, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 3.2002384662628174, | |
| "kl": 0.00543212890625, | |
| "learning_rate": 8.57142857142857e-07, | |
| "loss": 0.0005, | |
| "reward": 1.671875, | |
| "reward_mean": 1.671875, | |
| "reward_std": 0.2109457552433014, | |
| "rewards/accuracy_reward": 0.671875, | |
| "rewards/format_reward": 1.0, | |
| "step": 23 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-09, | |
| "completion_length": 75.375, | |
| "epoch": 0.14906832298136646, | |
| "grad_norm": 5.946903705596924, | |
| "kl": 0.0087890625, | |
| "learning_rate": 8.509316770186336e-07, | |
| "loss": 0.0009, | |
| "reward": 1.484375, | |
| "reward_mean": 1.484375, | |
| "reward_std": 0.19044628739356995, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 0.984375, | |
| "step": 24 | |
| }, | |
| { | |
| "advantages": -9.313225746154785e-10, | |
| "completion_length": 68.8125, | |
| "epoch": 0.15527950310559005, | |
| "grad_norm": 4.977855682373047, | |
| "kl": 0.008544921875, | |
| "learning_rate": 8.447204968944099e-07, | |
| "loss": 0.0009, | |
| "reward": 1.734375, | |
| "reward_mean": 1.734375, | |
| "reward_std": 0.12255740165710449, | |
| "rewards/accuracy_reward": 0.734375, | |
| "rewards/format_reward": 1.0, | |
| "step": 25 | |
| }, | |
| { | |
| "advantages": -8.381903171539307e-09, | |
| "completion_length": 83.59375, | |
| "epoch": 0.16149068322981366, | |
| "grad_norm": 4.409206390380859, | |
| "kl": 0.01239013671875, | |
| "learning_rate": 8.385093167701863e-07, | |
| "loss": 0.0012, | |
| "reward": 1.609375, | |
| "reward_mean": 1.609375, | |
| "reward_std": 0.2198973000049591, | |
| "rewards/accuracy_reward": 0.609375, | |
| "rewards/format_reward": 1.0, | |
| "step": 26 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-09, | |
| "completion_length": 78.109375, | |
| "epoch": 0.16770186335403728, | |
| "grad_norm": 3.1185989379882812, | |
| "kl": 0.006072998046875, | |
| "learning_rate": 8.322981366459628e-07, | |
| "loss": 0.0006, | |
| "reward": 1.65625, | |
| "reward_mean": 1.65625, | |
| "reward_std": 0.10888782143592834, | |
| "rewards/accuracy_reward": 0.671875, | |
| "rewards/format_reward": 0.984375, | |
| "step": 27 | |
| }, | |
| { | |
| "advantages": 4.6566128730773926e-09, | |
| "completion_length": 72.0625, | |
| "epoch": 0.17391304347826086, | |
| "grad_norm": 4.9565935134887695, | |
| "kl": 0.010009765625, | |
| "learning_rate": 8.260869565217391e-07, | |
| "loss": 0.001, | |
| "reward": 1.34375, | |
| "reward_mean": 1.34375, | |
| "reward_std": 0.16675157845020294, | |
| "rewards/accuracy_reward": 0.34375, | |
| "rewards/format_reward": 1.0, | |
| "step": 28 | |
| }, | |
| { | |
| "advantages": -4.6566128730773926e-09, | |
| "completion_length": 84.953125, | |
| "epoch": 0.18012422360248448, | |
| "grad_norm": 4.35609769821167, | |
| "kl": 0.011962890625, | |
| "learning_rate": 8.198757763975155e-07, | |
| "loss": 0.0012, | |
| "reward": 1.46875, | |
| "reward_mean": 1.46875, | |
| "reward_std": 0.25513991713523865, | |
| "rewards/accuracy_reward": 0.484375, | |
| "rewards/format_reward": 0.984375, | |
| "step": 29 | |
| }, | |
| { | |
| "advantages": -9.313225746154785e-10, | |
| "completion_length": 85.34375, | |
| "epoch": 0.18633540372670807, | |
| "grad_norm": 5.767938137054443, | |
| "kl": 0.009033203125, | |
| "learning_rate": 8.136645962732918e-07, | |
| "loss": 0.0009, | |
| "reward": 1.609375, | |
| "reward_mean": 1.609375, | |
| "reward_std": 0.1530819982290268, | |
| "rewards/accuracy_reward": 0.609375, | |
| "rewards/format_reward": 1.0, | |
| "step": 30 | |
| }, | |
| { | |
| "advantages": -5.587935447692871e-09, | |
| "completion_length": 83.5625, | |
| "epoch": 0.19254658385093168, | |
| "grad_norm": 49.12059783935547, | |
| "kl": 0.0091552734375, | |
| "learning_rate": 8.074534161490683e-07, | |
| "loss": 0.0009, | |
| "reward": 1.578125, | |
| "reward_mean": 1.578125, | |
| "reward_std": 0.10205793380737305, | |
| "rewards/accuracy_reward": 0.578125, | |
| "rewards/format_reward": 1.0, | |
| "step": 31 | |
| }, | |
| { | |
| "advantages": 4.6566128730773926e-09, | |
| "completion_length": 77.734375, | |
| "epoch": 0.19875776397515527, | |
| "grad_norm": 1.4828208684921265, | |
| "kl": 0.00970458984375, | |
| "learning_rate": 8.012422360248446e-07, | |
| "loss": 0.001, | |
| "reward": 1.421875, | |
| "reward_mean": 1.421875, | |
| "reward_std": 0.0646936446428299, | |
| "rewards/accuracy_reward": 0.421875, | |
| "rewards/format_reward": 1.0, | |
| "step": 32 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-09, | |
| "completion_length": 78.453125, | |
| "epoch": 0.20496894409937888, | |
| "grad_norm": 7.876468658447266, | |
| "kl": 0.020263671875, | |
| "learning_rate": 7.95031055900621e-07, | |
| "loss": 0.002, | |
| "reward": 1.734375, | |
| "reward_mean": 1.734375, | |
| "reward_std": 0.2109457552433014, | |
| "rewards/accuracy_reward": 0.734375, | |
| "rewards/format_reward": 1.0, | |
| "step": 33 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 80.5, | |
| "epoch": 0.2111801242236025, | |
| "grad_norm": 3.8213541507720947, | |
| "kl": 0.01361083984375, | |
| "learning_rate": 7.888198757763976e-07, | |
| "loss": 0.0014, | |
| "reward": 1.46875, | |
| "reward_mean": 1.46875, | |
| "reward_std": 0.0578637570142746, | |
| "rewards/accuracy_reward": 0.46875, | |
| "rewards/format_reward": 1.0, | |
| "step": 34 | |
| }, | |
| { | |
| "advantages": -6.51925802230835e-09, | |
| "completion_length": 89.5625, | |
| "epoch": 0.21739130434782608, | |
| "grad_norm": 3.453101634979248, | |
| "kl": 0.01470947265625, | |
| "learning_rate": 7.826086956521739e-07, | |
| "loss": 0.0015, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.17570312321186066, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 35 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 90.28125, | |
| "epoch": 0.2236024844720497, | |
| "grad_norm": 2.642101526260376, | |
| "kl": 0.0107421875, | |
| "learning_rate": 7.763975155279503e-07, | |
| "loss": 0.0011, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.06681530922651291, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 36 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 80.3125, | |
| "epoch": 0.22981366459627328, | |
| "grad_norm": 3.5424673557281494, | |
| "kl": 0.01239013671875, | |
| "learning_rate": 7.701863354037266e-07, | |
| "loss": 0.0012, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.0578637570142746, | |
| "rewards/accuracy_reward": 0.71875, | |
| "rewards/format_reward": 1.0, | |
| "step": 37 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 84.140625, | |
| "epoch": 0.2360248447204969, | |
| "grad_norm": 0.38800248503685, | |
| "kl": 0.01275634765625, | |
| "learning_rate": 7.639751552795031e-07, | |
| "loss": 0.0013, | |
| "reward": 1.375, | |
| "reward_mean": 1.375, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.390625, | |
| "rewards/format_reward": 0.984375, | |
| "step": 38 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-09, | |
| "completion_length": 89.109375, | |
| "epoch": 0.2422360248447205, | |
| "grad_norm": 2.645474433898926, | |
| "kl": 0.01397705078125, | |
| "learning_rate": 7.577639751552795e-07, | |
| "loss": 0.0014, | |
| "reward": 1.515625, | |
| "reward_mean": 1.515625, | |
| "reward_std": 0.04419417306780815, | |
| "rewards/accuracy_reward": 0.515625, | |
| "rewards/format_reward": 1.0, | |
| "step": 39 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 72.296875, | |
| "epoch": 0.2484472049689441, | |
| "grad_norm": 8.574762344360352, | |
| "kl": 0.0159912109375, | |
| "learning_rate": 7.515527950310558e-07, | |
| "loss": 0.0016, | |
| "reward": 1.671875, | |
| "reward_mean": 1.671875, | |
| "reward_std": 0.23144522309303284, | |
| "rewards/accuracy_reward": 0.671875, | |
| "rewards/format_reward": 1.0, | |
| "step": 40 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 86.046875, | |
| "epoch": 0.2546583850931677, | |
| "grad_norm": 36.26329040527344, | |
| "kl": 0.0147705078125, | |
| "learning_rate": 7.453416149068323e-07, | |
| "loss": 0.0015, | |
| "reward": 1.65625, | |
| "reward_mean": 1.65625, | |
| "reward_std": 0.23356688022613525, | |
| "rewards/accuracy_reward": 0.65625, | |
| "rewards/format_reward": 1.0, | |
| "step": 41 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 77.0625, | |
| "epoch": 0.2608695652173913, | |
| "grad_norm": 10.992830276489258, | |
| "kl": 0.0113525390625, | |
| "learning_rate": 7.391304347826086e-07, | |
| "loss": 0.0011, | |
| "reward": 1.703125, | |
| "reward_mean": 1.703125, | |
| "reward_std": 0.24464011192321777, | |
| "rewards/accuracy_reward": 0.703125, | |
| "rewards/format_reward": 1.0, | |
| "step": 42 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-09, | |
| "completion_length": 86.53125, | |
| "epoch": 0.2670807453416149, | |
| "grad_norm": 6.251725673675537, | |
| "kl": 0.009033203125, | |
| "learning_rate": 7.329192546583851e-07, | |
| "loss": 0.0009, | |
| "reward": 1.609375, | |
| "reward_mean": 1.609375, | |
| "reward_std": 0.23144522309303284, | |
| "rewards/accuracy_reward": 0.609375, | |
| "rewards/format_reward": 1.0, | |
| "step": 43 | |
| }, | |
| { | |
| "advantages": -4.6566128730773926e-09, | |
| "completion_length": 86.4375, | |
| "epoch": 0.2732919254658385, | |
| "grad_norm": 3.8048486709594727, | |
| "kl": 0.01385498046875, | |
| "learning_rate": 7.267080745341615e-07, | |
| "loss": 0.0014, | |
| "reward": 1.765625, | |
| "reward_mean": 1.765625, | |
| "reward_std": 0.17358146607875824, | |
| "rewards/accuracy_reward": 0.765625, | |
| "rewards/format_reward": 1.0, | |
| "step": 44 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-09, | |
| "completion_length": 84.21875, | |
| "epoch": 0.2795031055900621, | |
| "grad_norm": 2.5062499046325684, | |
| "kl": 0.00811767578125, | |
| "learning_rate": 7.204968944099379e-07, | |
| "loss": 0.0008, | |
| "reward": 1.796875, | |
| "reward_mean": 1.796875, | |
| "reward_std": 0.11100947856903076, | |
| "rewards/accuracy_reward": 0.8125, | |
| "rewards/format_reward": 0.984375, | |
| "step": 45 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 77.9375, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 3.8415560722351074, | |
| "kl": 0.01165771484375, | |
| "learning_rate": 7.142857142857143e-07, | |
| "loss": 0.0012, | |
| "reward": 1.53125, | |
| "reward_mean": 1.53125, | |
| "reward_std": 0.1462521106004715, | |
| "rewards/accuracy_reward": 0.53125, | |
| "rewards/format_reward": 1.0, | |
| "step": 46 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-09, | |
| "completion_length": 82.6875, | |
| "epoch": 0.2919254658385093, | |
| "grad_norm": 2.903069496154785, | |
| "kl": 0.012451171875, | |
| "learning_rate": 7.080745341614906e-07, | |
| "loss": 0.0012, | |
| "reward": 1.578125, | |
| "reward_mean": 1.578125, | |
| "reward_std": 0.11100947856903076, | |
| "rewards/accuracy_reward": 0.59375, | |
| "rewards/format_reward": 0.984375, | |
| "step": 47 | |
| }, | |
| { | |
| "advantages": -2.7939677238464355e-09, | |
| "completion_length": 75.578125, | |
| "epoch": 0.2981366459627329, | |
| "grad_norm": 11.884781837463379, | |
| "kl": 0.0125732421875, | |
| "learning_rate": 7.018633540372671e-07, | |
| "loss": 0.0013, | |
| "reward": 1.65625, | |
| "reward_mean": 1.65625, | |
| "reward_std": 0.17570312321186066, | |
| "rewards/accuracy_reward": 0.65625, | |
| "rewards/format_reward": 1.0, | |
| "step": 48 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-09, | |
| "completion_length": 73.34375, | |
| "epoch": 0.30434782608695654, | |
| "grad_norm": 2.234876871109009, | |
| "kl": 0.0084228515625, | |
| "learning_rate": 6.956521739130434e-07, | |
| "loss": 0.0008, | |
| "reward": 1.484375, | |
| "reward_mean": 1.484375, | |
| "reward_std": 0.04419417306780815, | |
| "rewards/accuracy_reward": 0.484375, | |
| "rewards/format_reward": 1.0, | |
| "step": 49 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-09, | |
| "completion_length": 81.8125, | |
| "epoch": 0.3105590062111801, | |
| "grad_norm": 4.401739597320557, | |
| "kl": 0.007232666015625, | |
| "learning_rate": 6.894409937888198e-07, | |
| "loss": 0.0007, | |
| "reward": 1.765625, | |
| "reward_mean": 1.765625, | |
| "reward_std": 0.17782479524612427, | |
| "rewards/accuracy_reward": 0.765625, | |
| "rewards/format_reward": 1.0, | |
| "step": 50 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 84.015625, | |
| "epoch": 0.3167701863354037, | |
| "grad_norm": 0.28293830156326294, | |
| "kl": 0.0062255859375, | |
| "learning_rate": 6.832298136645962e-07, | |
| "loss": 0.0006, | |
| "reward": 2.0, | |
| "reward_mean": 2.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 1.0, | |
| "rewards/format_reward": 1.0, | |
| "step": 51 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 79.125, | |
| "epoch": 0.32298136645962733, | |
| "grad_norm": 2.2039246559143066, | |
| "kl": 0.0106201171875, | |
| "learning_rate": 6.770186335403726e-07, | |
| "loss": 0.0011, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 52 | |
| }, | |
| { | |
| "advantages": 9.313225746154785e-10, | |
| "completion_length": 76.0625, | |
| "epoch": 0.32919254658385094, | |
| "grad_norm": 4.176709175109863, | |
| "kl": 0.01123046875, | |
| "learning_rate": 6.708074534161491e-07, | |
| "loss": 0.0011, | |
| "reward": 1.640625, | |
| "reward_mean": 1.640625, | |
| "reward_std": 0.1530819982290268, | |
| "rewards/accuracy_reward": 0.640625, | |
| "rewards/format_reward": 1.0, | |
| "step": 53 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 81.125, | |
| "epoch": 0.33540372670807456, | |
| "grad_norm": 30.12848663330078, | |
| "kl": 0.099609375, | |
| "learning_rate": 6.645962732919254e-07, | |
| "loss": 0.01, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.1462520956993103, | |
| "rewards/accuracy_reward": 0.71875, | |
| "rewards/format_reward": 1.0, | |
| "step": 54 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-09, | |
| "completion_length": 80.609375, | |
| "epoch": 0.3416149068322981, | |
| "grad_norm": 12.808406829833984, | |
| "kl": 0.01416015625, | |
| "learning_rate": 6.583850931677019e-07, | |
| "loss": 0.0014, | |
| "reward": 1.6875, | |
| "reward_mean": 1.6875, | |
| "reward_std": 0.2238783985376358, | |
| "rewards/accuracy_reward": 0.703125, | |
| "rewards/format_reward": 0.984375, | |
| "step": 55 | |
| }, | |
| { | |
| "advantages": -5.587935447692871e-09, | |
| "completion_length": 76.15625, | |
| "epoch": 0.34782608695652173, | |
| "grad_norm": 5.750046253204346, | |
| "kl": 0.01019287109375, | |
| "learning_rate": 6.521739130434782e-07, | |
| "loss": 0.001, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.2041158676147461, | |
| "rewards/accuracy_reward": 0.515625, | |
| "rewards/format_reward": 0.984375, | |
| "step": 56 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 76.765625, | |
| "epoch": 0.35403726708074534, | |
| "grad_norm": 4.7853102684021, | |
| "kl": 0.010986328125, | |
| "learning_rate": 6.459627329192546e-07, | |
| "loss": 0.0011, | |
| "reward": 1.328125, | |
| "reward_mean": 1.328125, | |
| "reward_std": 0.19044628739356995, | |
| "rewards/accuracy_reward": 0.34375, | |
| "rewards/format_reward": 0.984375, | |
| "step": 57 | |
| }, | |
| { | |
| "advantages": 4.6566128730773926e-09, | |
| "completion_length": 88.796875, | |
| "epoch": 0.36024844720496896, | |
| "grad_norm": 1.7344197034835815, | |
| "kl": 0.00982666015625, | |
| "learning_rate": 6.39751552795031e-07, | |
| "loss": 0.001, | |
| "reward": 1.671875, | |
| "reward_mean": 1.671875, | |
| "reward_std": 0.0646936446428299, | |
| "rewards/accuracy_reward": 0.671875, | |
| "rewards/format_reward": 1.0, | |
| "step": 58 | |
| }, | |
| { | |
| "advantages": -9.313225746154785e-09, | |
| "completion_length": 84.28125, | |
| "epoch": 0.36645962732919257, | |
| "grad_norm": 3.1260499954223633, | |
| "kl": 0.01416015625, | |
| "learning_rate": 6.335403726708074e-07, | |
| "loss": 0.0014, | |
| "reward": 1.6875, | |
| "reward_mean": 1.6875, | |
| "reward_std": 0.1828794628381729, | |
| "rewards/accuracy_reward": 0.703125, | |
| "rewards/format_reward": 0.984375, | |
| "step": 59 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 77.21875, | |
| "epoch": 0.37267080745341613, | |
| "grad_norm": 1.7963190078735352, | |
| "kl": 0.0089111328125, | |
| "learning_rate": 6.273291925465838e-07, | |
| "loss": 0.0009, | |
| "reward": 1.84375, | |
| "reward_mean": 1.84375, | |
| "reward_std": 0.0578637570142746, | |
| "rewards/accuracy_reward": 0.84375, | |
| "rewards/format_reward": 1.0, | |
| "step": 60 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 82.25, | |
| "epoch": 0.37888198757763975, | |
| "grad_norm": 2.5049538612365723, | |
| "kl": 0.00787353515625, | |
| "learning_rate": 6.211180124223601e-07, | |
| "loss": 0.0008, | |
| "reward": 1.625, | |
| "reward_mean": 1.625, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.625, | |
| "rewards/format_reward": 1.0, | |
| "step": 61 | |
| }, | |
| { | |
| "advantages": -1.210719347000122e-08, | |
| "completion_length": 80.375, | |
| "epoch": 0.38509316770186336, | |
| "grad_norm": 5.739541530609131, | |
| "kl": 0.01312255859375, | |
| "learning_rate": 6.149068322981367e-07, | |
| "loss": 0.0013, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.2177756428718567, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 62 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-09, | |
| "completion_length": 84.296875, | |
| "epoch": 0.391304347826087, | |
| "grad_norm": 4.335031032562256, | |
| "kl": 0.01116943359375, | |
| "learning_rate": 6.08695652173913e-07, | |
| "loss": 0.0011, | |
| "reward": 1.90625, | |
| "reward_mean": 1.90625, | |
| "reward_std": 0.2041158676147461, | |
| "rewards/accuracy_reward": 0.90625, | |
| "rewards/format_reward": 1.0, | |
| "step": 63 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-09, | |
| "completion_length": 86.828125, | |
| "epoch": 0.39751552795031053, | |
| "grad_norm": 4.443232536315918, | |
| "kl": 0.01336669921875, | |
| "learning_rate": 6.024844720496894e-07, | |
| "loss": 0.0013, | |
| "reward": 1.703125, | |
| "reward_mean": 1.703125, | |
| "reward_std": 0.19939783215522766, | |
| "rewards/accuracy_reward": 0.703125, | |
| "rewards/format_reward": 1.0, | |
| "step": 64 | |
| }, | |
| { | |
| "advantages": 5.587935447692871e-09, | |
| "completion_length": 75.71875, | |
| "epoch": 0.40372670807453415, | |
| "grad_norm": 7.092515468597412, | |
| "kl": 0.01251220703125, | |
| "learning_rate": 5.962732919254659e-07, | |
| "loss": 0.0013, | |
| "reward": 1.65625, | |
| "reward_mean": 1.65625, | |
| "reward_std": 0.23827511072158813, | |
| "rewards/accuracy_reward": 0.65625, | |
| "rewards/format_reward": 1.0, | |
| "step": 65 | |
| }, | |
| { | |
| "advantages": 4.6566128730773926e-09, | |
| "completion_length": 82.140625, | |
| "epoch": 0.40993788819875776, | |
| "grad_norm": 4.468729496002197, | |
| "kl": 0.0211181640625, | |
| "learning_rate": 5.900621118012422e-07, | |
| "loss": 0.0021, | |
| "reward": 1.796875, | |
| "reward_mean": 1.796875, | |
| "reward_std": 0.0646936446428299, | |
| "rewards/accuracy_reward": 0.796875, | |
| "rewards/format_reward": 1.0, | |
| "step": 66 | |
| }, | |
| { | |
| "advantages": 4.6566128730773926e-09, | |
| "completion_length": 74.890625, | |
| "epoch": 0.4161490683229814, | |
| "grad_norm": 9.289567947387695, | |
| "kl": 0.01611328125, | |
| "learning_rate": 5.838509316770186e-07, | |
| "loss": 0.0016, | |
| "reward": 1.421875, | |
| "reward_mean": 1.421875, | |
| "reward_std": 0.1983242630958557, | |
| "rewards/accuracy_reward": 0.421875, | |
| "rewards/format_reward": 1.0, | |
| "step": 67 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 77.625, | |
| "epoch": 0.422360248447205, | |
| "grad_norm": 0.4326918125152588, | |
| "kl": 0.0140380859375, | |
| "learning_rate": 5.77639751552795e-07, | |
| "loss": 0.0014, | |
| "reward": 1.875, | |
| "reward_mean": 1.875, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 68 | |
| }, | |
| { | |
| "advantages": -9.313225746154785e-10, | |
| "completion_length": 81.71875, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 5.539842128753662, | |
| "kl": 0.04296875, | |
| "learning_rate": 5.714285714285714e-07, | |
| "loss": 0.0043, | |
| "reward": 1.4375, | |
| "reward_mean": 1.4375, | |
| "reward_std": 0.34352827072143555, | |
| "rewards/accuracy_reward": 0.453125, | |
| "rewards/format_reward": 0.984375, | |
| "step": 69 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 90.703125, | |
| "epoch": 0.43478260869565216, | |
| "grad_norm": 0.46686819195747375, | |
| "kl": 0.0074462890625, | |
| "learning_rate": 5.652173913043477e-07, | |
| "loss": 0.0007, | |
| "reward": 1.875, | |
| "reward_mean": 1.875, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.875, | |
| "rewards/format_reward": 1.0, | |
| "step": 70 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-09, | |
| "completion_length": 83.578125, | |
| "epoch": 0.4409937888198758, | |
| "grad_norm": 8.54028606414795, | |
| "kl": 0.00897216796875, | |
| "learning_rate": 5.590062111801241e-07, | |
| "loss": 0.0009, | |
| "reward": 1.765625, | |
| "reward_mean": 1.765625, | |
| "reward_std": 0.04419417306780815, | |
| "rewards/accuracy_reward": 0.765625, | |
| "rewards/format_reward": 1.0, | |
| "step": 71 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 84.25, | |
| "epoch": 0.4472049689440994, | |
| "grad_norm": 12.895256996154785, | |
| "kl": 0.00579833984375, | |
| "learning_rate": 5.527950310559007e-07, | |
| "loss": 0.0006, | |
| "reward": 1.453125, | |
| "reward_mean": 1.453125, | |
| "reward_std": 0.12255740165710449, | |
| "rewards/accuracy_reward": 0.453125, | |
| "rewards/format_reward": 1.0, | |
| "step": 72 | |
| }, | |
| { | |
| "advantages": -9.313225746154785e-09, | |
| "completion_length": 77.15625, | |
| "epoch": 0.453416149068323, | |
| "grad_norm": 5.548634052276611, | |
| "kl": 0.0123291015625, | |
| "learning_rate": 5.46583850931677e-07, | |
| "loss": 0.0012, | |
| "reward": 1.796875, | |
| "reward_mean": 1.796875, | |
| "reward_std": 0.31983357667922974, | |
| "rewards/accuracy_reward": 0.796875, | |
| "rewards/format_reward": 1.0, | |
| "step": 73 | |
| }, | |
| { | |
| "advantages": -1.0244548320770264e-08, | |
| "completion_length": 84.75, | |
| "epoch": 0.45962732919254656, | |
| "grad_norm": 3.4154112339019775, | |
| "kl": 0.018798828125, | |
| "learning_rate": 5.403726708074534e-07, | |
| "loss": 0.0019, | |
| "reward": 1.78125, | |
| "reward_mean": 1.78125, | |
| "reward_std": 0.19727616012096405, | |
| "rewards/accuracy_reward": 0.78125, | |
| "rewards/format_reward": 1.0, | |
| "step": 74 | |
| }, | |
| { | |
| "advantages": -4.6566128730773926e-09, | |
| "completion_length": 83.015625, | |
| "epoch": 0.4658385093167702, | |
| "grad_norm": 3.4328691959381104, | |
| "kl": 0.01275634765625, | |
| "learning_rate": 5.341614906832298e-07, | |
| "loss": 0.0013, | |
| "reward": 1.53125, | |
| "reward_mean": 1.53125, | |
| "reward_std": 0.23356688022613525, | |
| "rewards/accuracy_reward": 0.53125, | |
| "rewards/format_reward": 1.0, | |
| "step": 75 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-09, | |
| "completion_length": 78.609375, | |
| "epoch": 0.4720496894409938, | |
| "grad_norm": 3.627190113067627, | |
| "kl": 0.0142822265625, | |
| "learning_rate": 5.279503105590062e-07, | |
| "loss": 0.0014, | |
| "reward": 1.9375, | |
| "reward_mean": 1.9375, | |
| "reward_std": 0.1462521106004715, | |
| "rewards/accuracy_reward": 0.9375, | |
| "rewards/format_reward": 1.0, | |
| "step": 76 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 80.46875, | |
| "epoch": 0.4782608695652174, | |
| "grad_norm": 10.168981552124023, | |
| "kl": 0.01251220703125, | |
| "learning_rate": 5.217391304347825e-07, | |
| "loss": 0.0013, | |
| "reward": 1.515625, | |
| "reward_mean": 1.515625, | |
| "reward_std": 0.2109457552433014, | |
| "rewards/accuracy_reward": 0.515625, | |
| "rewards/format_reward": 1.0, | |
| "step": 77 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 83.421875, | |
| "epoch": 0.484472049689441, | |
| "grad_norm": 20.923242568969727, | |
| "kl": 0.0113525390625, | |
| "learning_rate": 5.15527950310559e-07, | |
| "loss": 0.0011, | |
| "reward": 1.828125, | |
| "reward_mean": 1.828125, | |
| "reward_std": 0.19044628739356995, | |
| "rewards/accuracy_reward": 0.828125, | |
| "rewards/format_reward": 1.0, | |
| "step": 78 | |
| }, | |
| { | |
| "advantages": 2.7939677238464355e-09, | |
| "completion_length": 75.109375, | |
| "epoch": 0.4906832298136646, | |
| "grad_norm": 3.643770933151245, | |
| "kl": 0.00811767578125, | |
| "learning_rate": 5.093167701863354e-07, | |
| "loss": 0.0008, | |
| "reward": 1.78125, | |
| "reward_mean": 1.78125, | |
| "reward_std": 0.10888782143592834, | |
| "rewards/accuracy_reward": 0.78125, | |
| "rewards/format_reward": 1.0, | |
| "step": 79 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-09, | |
| "completion_length": 81.65625, | |
| "epoch": 0.4968944099378882, | |
| "grad_norm": 4.883938312530518, | |
| "kl": 0.015625, | |
| "learning_rate": 5.031055900621117e-07, | |
| "loss": 0.0016, | |
| "reward": 1.25, | |
| "reward_mean": 1.25, | |
| "reward_std": 0.2130674123764038, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 80 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 82.515625, | |
| "epoch": 0.5031055900621118, | |
| "grad_norm": 1.3860398530960083, | |
| "kl": 0.00799560546875, | |
| "learning_rate": 4.968944099378881e-07, | |
| "loss": 0.0008, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.0578637570142746, | |
| "rewards/accuracy_reward": 0.71875, | |
| "rewards/format_reward": 1.0, | |
| "step": 81 | |
| }, | |
| { | |
| "advantages": 9.313225746154785e-10, | |
| "completion_length": 77.6875, | |
| "epoch": 0.5093167701863354, | |
| "grad_norm": 3.7328872680664062, | |
| "kl": 0.0181884765625, | |
| "learning_rate": 4.906832298136646e-07, | |
| "loss": 0.0018, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.16675157845020294, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 82 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 76.4375, | |
| "epoch": 0.515527950310559, | |
| "grad_norm": 3.6228644847869873, | |
| "kl": 0.01446533203125, | |
| "learning_rate": 4.84472049689441e-07, | |
| "loss": 0.0014, | |
| "reward": 1.46875, | |
| "reward_mean": 1.46875, | |
| "reward_std": 0.1246790662407875, | |
| "rewards/accuracy_reward": 0.46875, | |
| "rewards/format_reward": 1.0, | |
| "step": 83 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-09, | |
| "completion_length": 79.140625, | |
| "epoch": 0.5217391304347826, | |
| "grad_norm": 5.579171180725098, | |
| "kl": 0.0159912109375, | |
| "learning_rate": 4.782608695652174e-07, | |
| "loss": 0.0016, | |
| "reward": 1.65625, | |
| "reward_mean": 1.65625, | |
| "reward_std": 0.23356688022613525, | |
| "rewards/accuracy_reward": 0.671875, | |
| "rewards/format_reward": 0.984375, | |
| "step": 84 | |
| }, | |
| { | |
| "advantages": -5.587935447692871e-09, | |
| "completion_length": 80.0, | |
| "epoch": 0.5279503105590062, | |
| "grad_norm": 9.611387252807617, | |
| "kl": 0.01080322265625, | |
| "learning_rate": 4.7204968944099376e-07, | |
| "loss": 0.0011, | |
| "reward": 1.828125, | |
| "reward_mean": 1.828125, | |
| "reward_std": 0.13258251547813416, | |
| "rewards/accuracy_reward": 0.828125, | |
| "rewards/format_reward": 1.0, | |
| "step": 85 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-09, | |
| "completion_length": 85.5, | |
| "epoch": 0.5341614906832298, | |
| "grad_norm": 4.1448540687561035, | |
| "kl": 0.01007080078125, | |
| "learning_rate": 4.6583850931677014e-07, | |
| "loss": 0.001, | |
| "reward": 1.859375, | |
| "reward_mean": 1.859375, | |
| "reward_std": 0.17358146607875824, | |
| "rewards/accuracy_reward": 0.859375, | |
| "rewards/format_reward": 1.0, | |
| "step": 86 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-09, | |
| "completion_length": 75.5, | |
| "epoch": 0.5403726708074534, | |
| "grad_norm": 5.654483795166016, | |
| "kl": 0.01123046875, | |
| "learning_rate": 4.596273291925465e-07, | |
| "loss": 0.0011, | |
| "reward": 1.796875, | |
| "reward_mean": 1.796875, | |
| "reward_std": 0.1530819982290268, | |
| "rewards/accuracy_reward": 0.796875, | |
| "rewards/format_reward": 1.0, | |
| "step": 87 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 72.671875, | |
| "epoch": 0.546583850931677, | |
| "grad_norm": 2.2370052337646484, | |
| "kl": 0.0137939453125, | |
| "learning_rate": 4.53416149068323e-07, | |
| "loss": 0.0014, | |
| "reward": 1.46875, | |
| "reward_mean": 1.46875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.46875, | |
| "rewards/format_reward": 1.0, | |
| "step": 88 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 81.84375, | |
| "epoch": 0.5527950310559007, | |
| "grad_norm": 1.389394760131836, | |
| "kl": 0.00836181640625, | |
| "learning_rate": 4.472049689440994e-07, | |
| "loss": 0.0008, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.06681530922651291, | |
| "rewards/accuracy_reward": 0.765625, | |
| "rewards/format_reward": 0.984375, | |
| "step": 89 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 74.59375, | |
| "epoch": 0.5590062111801242, | |
| "grad_norm": 2.353760242462158, | |
| "kl": 0.00811767578125, | |
| "learning_rate": 4.4099378881987576e-07, | |
| "loss": 0.0008, | |
| "reward": 1.6875, | |
| "reward_mean": 1.6875, | |
| "reward_std": 0.06681530922651291, | |
| "rewards/accuracy_reward": 0.6875, | |
| "rewards/format_reward": 1.0, | |
| "step": 90 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 85.28125, | |
| "epoch": 0.5652173913043478, | |
| "grad_norm": 1.5767848491668701, | |
| "kl": 0.009765625, | |
| "learning_rate": 4.3478260869565214e-07, | |
| "loss": 0.001, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 91 | |
| }, | |
| { | |
| "advantages": -8.381903171539307e-09, | |
| "completion_length": 81.859375, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 3.835320234298706, | |
| "kl": 0.0181884765625, | |
| "learning_rate": 4.285714285714285e-07, | |
| "loss": 0.0018, | |
| "reward": 1.671875, | |
| "reward_mean": 1.671875, | |
| "reward_std": 0.1530819982290268, | |
| "rewards/accuracy_reward": 0.6875, | |
| "rewards/format_reward": 0.984375, | |
| "step": 92 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 83.671875, | |
| "epoch": 0.577639751552795, | |
| "grad_norm": 9.30271053314209, | |
| "kl": 0.017822265625, | |
| "learning_rate": 4.2236024844720495e-07, | |
| "loss": 0.0018, | |
| "reward": 1.796875, | |
| "reward_mean": 1.796875, | |
| "reward_std": 0.23144522309303284, | |
| "rewards/accuracy_reward": 0.796875, | |
| "rewards/format_reward": 1.0, | |
| "step": 93 | |
| }, | |
| { | |
| "advantages": -4.6566128730773926e-09, | |
| "completion_length": 77.53125, | |
| "epoch": 0.5838509316770186, | |
| "grad_norm": 6.170975685119629, | |
| "kl": 0.009521484375, | |
| "learning_rate": 4.161490683229814e-07, | |
| "loss": 0.001, | |
| "reward": 1.65625, | |
| "reward_mean": 1.65625, | |
| "reward_std": 0.16675157845020294, | |
| "rewards/accuracy_reward": 0.65625, | |
| "rewards/format_reward": 1.0, | |
| "step": 94 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-09, | |
| "completion_length": 85.640625, | |
| "epoch": 0.5900621118012422, | |
| "grad_norm": 4.217593669891357, | |
| "kl": 0.01409912109375, | |
| "learning_rate": 4.0993788819875776e-07, | |
| "loss": 0.0014, | |
| "reward": 1.734375, | |
| "reward_mean": 1.734375, | |
| "reward_std": 0.15992169082164764, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 0.984375, | |
| "step": 95 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 77.296875, | |
| "epoch": 0.5962732919254659, | |
| "grad_norm": 6.138365268707275, | |
| "kl": 0.0106201171875, | |
| "learning_rate": 4.0372670807453413e-07, | |
| "loss": 0.0011, | |
| "reward": 1.375, | |
| "reward_mean": 1.375, | |
| "reward_std": 0.06681530922651291, | |
| "rewards/accuracy_reward": 0.390625, | |
| "rewards/format_reward": 0.984375, | |
| "step": 96 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-09, | |
| "completion_length": 76.484375, | |
| "epoch": 0.6024844720496895, | |
| "grad_norm": 1.2896429300308228, | |
| "kl": 0.00970458984375, | |
| "learning_rate": 3.975155279503105e-07, | |
| "loss": 0.001, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.734375, | |
| "rewards/format_reward": 0.984375, | |
| "step": 97 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 81.71875, | |
| "epoch": 0.6086956521739131, | |
| "grad_norm": 6.941093444824219, | |
| "kl": 0.01165771484375, | |
| "learning_rate": 3.9130434782608694e-07, | |
| "loss": 0.0012, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.0578637570142746, | |
| "rewards/accuracy_reward": 0.71875, | |
| "rewards/format_reward": 1.0, | |
| "step": 98 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 81.390625, | |
| "epoch": 0.6149068322981367, | |
| "grad_norm": 3.163457155227661, | |
| "kl": 0.00787353515625, | |
| "learning_rate": 3.850931677018633e-07, | |
| "loss": 0.0008, | |
| "reward": 1.96875, | |
| "reward_mean": 1.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.96875, | |
| "rewards/format_reward": 1.0, | |
| "step": 99 | |
| }, | |
| { | |
| "advantages": 4.6566128730773926e-09, | |
| "completion_length": 79.1875, | |
| "epoch": 0.6211180124223602, | |
| "grad_norm": 4.2669830322265625, | |
| "kl": 0.0108642578125, | |
| "learning_rate": 3.7888198757763975e-07, | |
| "loss": 0.0011, | |
| "reward": 1.671875, | |
| "reward_mean": 1.671875, | |
| "reward_std": 0.0646936446428299, | |
| "rewards/accuracy_reward": 0.671875, | |
| "rewards/format_reward": 1.0, | |
| "step": 100 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-09, | |
| "completion_length": 77.578125, | |
| "epoch": 0.6273291925465838, | |
| "grad_norm": 6.153615474700928, | |
| "kl": 0.0101318359375, | |
| "learning_rate": 3.7267080745341613e-07, | |
| "loss": 0.001, | |
| "reward": 1.359375, | |
| "reward_mean": 1.359375, | |
| "reward_std": 0.04419417306780815, | |
| "rewards/accuracy_reward": 0.359375, | |
| "rewards/format_reward": 1.0, | |
| "step": 101 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-09, | |
| "completion_length": 81.53125, | |
| "epoch": 0.6335403726708074, | |
| "grad_norm": 4.077609539031982, | |
| "kl": 0.0181884765625, | |
| "learning_rate": 3.6645962732919256e-07, | |
| "loss": 0.0018, | |
| "reward": 1.84375, | |
| "reward_mean": 1.84375, | |
| "reward_std": 0.2177756428718567, | |
| "rewards/accuracy_reward": 0.84375, | |
| "rewards/format_reward": 1.0, | |
| "step": 102 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 80.375, | |
| "epoch": 0.639751552795031, | |
| "grad_norm": 3.084027051925659, | |
| "kl": 0.01007080078125, | |
| "learning_rate": 3.6024844720496894e-07, | |
| "loss": 0.001, | |
| "reward": 1.53125, | |
| "reward_mean": 1.53125, | |
| "reward_std": 0.1462521106004715, | |
| "rewards/accuracy_reward": 0.53125, | |
| "rewards/format_reward": 1.0, | |
| "step": 103 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 83.953125, | |
| "epoch": 0.6459627329192547, | |
| "grad_norm": 2.0512335300445557, | |
| "kl": 0.007476806640625, | |
| "learning_rate": 3.540372670807453e-07, | |
| "loss": 0.0007, | |
| "reward": 1.453125, | |
| "reward_mean": 1.453125, | |
| "reward_std": 0.0646936446428299, | |
| "rewards/accuracy_reward": 0.453125, | |
| "rewards/format_reward": 1.0, | |
| "step": 104 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 82.5625, | |
| "epoch": 0.6521739130434783, | |
| "grad_norm": 0.5302374362945557, | |
| "kl": 0.00982666015625, | |
| "learning_rate": 3.478260869565217e-07, | |
| "loss": 0.001, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 105 | |
| }, | |
| { | |
| "advantages": -4.6566128730773926e-09, | |
| "completion_length": 75.953125, | |
| "epoch": 0.6583850931677019, | |
| "grad_norm": 6.2678751945495605, | |
| "kl": 0.0111083984375, | |
| "learning_rate": 3.416149068322981e-07, | |
| "loss": 0.0011, | |
| "reward": 1.890625, | |
| "reward_mean": 1.890625, | |
| "reward_std": 0.1315089464187622, | |
| "rewards/accuracy_reward": 0.890625, | |
| "rewards/format_reward": 1.0, | |
| "step": 106 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 78.328125, | |
| "epoch": 0.6645962732919255, | |
| "grad_norm": 1.7859537601470947, | |
| "kl": 0.00946044921875, | |
| "learning_rate": 3.3540372670807456e-07, | |
| "loss": 0.0009, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.71875, | |
| "rewards/format_reward": 1.0, | |
| "step": 107 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 91.8125, | |
| "epoch": 0.6708074534161491, | |
| "grad_norm": 2.7167623043060303, | |
| "kl": 0.0081787109375, | |
| "learning_rate": 3.2919254658385094e-07, | |
| "loss": 0.0008, | |
| "reward": 1.65625, | |
| "reward_mean": 1.65625, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.671875, | |
| "rewards/format_reward": 0.984375, | |
| "step": 108 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 76.984375, | |
| "epoch": 0.6770186335403726, | |
| "grad_norm": 5.3628058433532715, | |
| "kl": 0.009033203125, | |
| "learning_rate": 3.229813664596273e-07, | |
| "loss": 0.0009, | |
| "reward": 1.515625, | |
| "reward_mean": 1.515625, | |
| "reward_std": 0.19044628739356995, | |
| "rewards/accuracy_reward": 0.515625, | |
| "rewards/format_reward": 1.0, | |
| "step": 109 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-09, | |
| "completion_length": 73.5, | |
| "epoch": 0.6832298136645962, | |
| "grad_norm": 3.2727582454681396, | |
| "kl": 0.0108642578125, | |
| "learning_rate": 3.167701863354037e-07, | |
| "loss": 0.0011, | |
| "reward": 1.609375, | |
| "reward_mean": 1.609375, | |
| "reward_std": 0.04419417306780815, | |
| "rewards/accuracy_reward": 0.609375, | |
| "rewards/format_reward": 1.0, | |
| "step": 110 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 75.75, | |
| "epoch": 0.6894409937888198, | |
| "grad_norm": 11.552366256713867, | |
| "kl": 0.0145263671875, | |
| "learning_rate": 3.105590062111801e-07, | |
| "loss": 0.0015, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 111 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 85.734375, | |
| "epoch": 0.6956521739130435, | |
| "grad_norm": 6.025736331939697, | |
| "kl": 0.01556396484375, | |
| "learning_rate": 3.043478260869565e-07, | |
| "loss": 0.0016, | |
| "reward": 1.59375, | |
| "reward_mean": 1.59375, | |
| "reward_std": 0.1552036553621292, | |
| "rewards/accuracy_reward": 0.59375, | |
| "rewards/format_reward": 1.0, | |
| "step": 112 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 82.734375, | |
| "epoch": 0.7018633540372671, | |
| "grad_norm": 15.336418151855469, | |
| "kl": 0.057373046875, | |
| "learning_rate": 2.9813664596273294e-07, | |
| "loss": 0.0057, | |
| "reward": 1.84375, | |
| "reward_mean": 1.84375, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.84375, | |
| "rewards/format_reward": 1.0, | |
| "step": 113 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 78.953125, | |
| "epoch": 0.7080745341614907, | |
| "grad_norm": 65.76184844970703, | |
| "kl": 0.01385498046875, | |
| "learning_rate": 2.919254658385093e-07, | |
| "loss": 0.0014, | |
| "reward": 1.90625, | |
| "reward_mean": 1.90625, | |
| "reward_std": 0.1552036553621292, | |
| "rewards/accuracy_reward": 0.90625, | |
| "rewards/format_reward": 1.0, | |
| "step": 114 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-09, | |
| "completion_length": 79.765625, | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 3.660456657409668, | |
| "kl": 0.0194091796875, | |
| "learning_rate": 2.857142857142857e-07, | |
| "loss": 0.0019, | |
| "reward": 1.59375, | |
| "reward_mean": 1.59375, | |
| "reward_std": 0.10888782143592834, | |
| "rewards/accuracy_reward": 0.59375, | |
| "rewards/format_reward": 1.0, | |
| "step": 115 | |
| }, | |
| { | |
| "advantages": -5.587935447692871e-09, | |
| "completion_length": 76.34375, | |
| "epoch": 0.7204968944099379, | |
| "grad_norm": 4.989613056182861, | |
| "kl": 0.00787353515625, | |
| "learning_rate": 2.7950310559006207e-07, | |
| "loss": 0.0008, | |
| "reward": 1.828125, | |
| "reward_mean": 1.828125, | |
| "reward_std": 0.13258251547813416, | |
| "rewards/accuracy_reward": 0.84375, | |
| "rewards/format_reward": 0.984375, | |
| "step": 116 | |
| }, | |
| { | |
| "advantages": -9.313225746154785e-10, | |
| "completion_length": 77.859375, | |
| "epoch": 0.7267080745341615, | |
| "grad_norm": 2.4932050704956055, | |
| "kl": 0.0081787109375, | |
| "learning_rate": 2.732919254658385e-07, | |
| "loss": 0.0008, | |
| "reward": 1.859375, | |
| "reward_mean": 1.859375, | |
| "reward_std": 0.12255740165710449, | |
| "rewards/accuracy_reward": 0.859375, | |
| "rewards/format_reward": 1.0, | |
| "step": 117 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-09, | |
| "completion_length": 84.5, | |
| "epoch": 0.7329192546583851, | |
| "grad_norm": 5.0420732498168945, | |
| "kl": 0.01226806640625, | |
| "learning_rate": 2.670807453416149e-07, | |
| "loss": 0.0012, | |
| "reward": 1.640625, | |
| "reward_mean": 1.640625, | |
| "reward_std": 0.23144522309303284, | |
| "rewards/accuracy_reward": 0.640625, | |
| "rewards/format_reward": 1.0, | |
| "step": 118 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-09, | |
| "completion_length": 79.703125, | |
| "epoch": 0.7391304347826086, | |
| "grad_norm": 3.599855899810791, | |
| "kl": 0.0086669921875, | |
| "learning_rate": 2.6086956521739126e-07, | |
| "loss": 0.0009, | |
| "reward": 1.484375, | |
| "reward_mean": 1.484375, | |
| "reward_std": 0.13258251547813416, | |
| "rewards/accuracy_reward": 0.484375, | |
| "rewards/format_reward": 1.0, | |
| "step": 119 | |
| }, | |
| { | |
| "advantages": -5.587935447692871e-09, | |
| "completion_length": 79.1875, | |
| "epoch": 0.7453416149068323, | |
| "grad_norm": 3.320706605911255, | |
| "kl": 0.0133056640625, | |
| "learning_rate": 2.546583850931677e-07, | |
| "loss": 0.0013, | |
| "reward": 1.828125, | |
| "reward_mean": 1.828125, | |
| "reward_std": 0.10205793380737305, | |
| "rewards/accuracy_reward": 0.828125, | |
| "rewards/format_reward": 1.0, | |
| "step": 120 | |
| }, | |
| { | |
| "advantages": -8.381903171539307e-09, | |
| "completion_length": 95.203125, | |
| "epoch": 0.7515527950310559, | |
| "grad_norm": 2.8366851806640625, | |
| "kl": 0.0064697265625, | |
| "learning_rate": 2.4844720496894407e-07, | |
| "loss": 0.0006, | |
| "reward": 1.671875, | |
| "reward_mean": 1.671875, | |
| "reward_std": 0.1530819833278656, | |
| "rewards/accuracy_reward": 0.671875, | |
| "rewards/format_reward": 1.0, | |
| "step": 121 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-09, | |
| "completion_length": 78.765625, | |
| "epoch": 0.7577639751552795, | |
| "grad_norm": 3.376732587814331, | |
| "kl": 0.00860595703125, | |
| "learning_rate": 2.422360248447205e-07, | |
| "loss": 0.0009, | |
| "reward": 1.640625, | |
| "reward_mean": 1.640625, | |
| "reward_std": 0.10205793380737305, | |
| "rewards/accuracy_reward": 0.640625, | |
| "rewards/format_reward": 1.0, | |
| "step": 122 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-09, | |
| "completion_length": 79.453125, | |
| "epoch": 0.7639751552795031, | |
| "grad_norm": 3.5682129859924316, | |
| "kl": 0.018798828125, | |
| "learning_rate": 2.3602484472049688e-07, | |
| "loss": 0.0019, | |
| "reward": 1.671875, | |
| "reward_mean": 1.671875, | |
| "reward_std": 0.1804211586713791, | |
| "rewards/accuracy_reward": 0.671875, | |
| "rewards/format_reward": 1.0, | |
| "step": 123 | |
| }, | |
| { | |
| "advantages": 4.6566128730773926e-09, | |
| "completion_length": 74.96875, | |
| "epoch": 0.7701863354037267, | |
| "grad_norm": 2.6698434352874756, | |
| "kl": 0.006256103515625, | |
| "learning_rate": 2.2981366459627326e-07, | |
| "loss": 0.0006, | |
| "reward": 1.546875, | |
| "reward_mean": 1.546875, | |
| "reward_std": 0.0646936446428299, | |
| "rewards/accuracy_reward": 0.546875, | |
| "rewards/format_reward": 1.0, | |
| "step": 124 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 78.421875, | |
| "epoch": 0.7763975155279503, | |
| "grad_norm": 3.1063811779022217, | |
| "kl": 0.01214599609375, | |
| "learning_rate": 2.236024844720497e-07, | |
| "loss": 0.0012, | |
| "reward": 1.765625, | |
| "reward_mean": 1.765625, | |
| "reward_std": 0.12255740165710449, | |
| "rewards/accuracy_reward": 0.765625, | |
| "rewards/format_reward": 1.0, | |
| "step": 125 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-09, | |
| "completion_length": 86.703125, | |
| "epoch": 0.782608695652174, | |
| "grad_norm": 2.7392446994781494, | |
| "kl": 0.00634765625, | |
| "learning_rate": 2.1739130434782607e-07, | |
| "loss": 0.0006, | |
| "reward": 1.640625, | |
| "reward_mean": 1.640625, | |
| "reward_std": 0.04419417306780815, | |
| "rewards/accuracy_reward": 0.640625, | |
| "rewards/format_reward": 1.0, | |
| "step": 126 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 83.546875, | |
| "epoch": 0.7888198757763976, | |
| "grad_norm": 9.345684051513672, | |
| "kl": 0.008056640625, | |
| "learning_rate": 2.1118012422360247e-07, | |
| "loss": 0.0008, | |
| "reward": 1.296875, | |
| "reward_mean": 1.296875, | |
| "reward_std": 0.19044628739356995, | |
| "rewards/accuracy_reward": 0.296875, | |
| "rewards/format_reward": 1.0, | |
| "step": 127 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 85.84375, | |
| "epoch": 0.7950310559006211, | |
| "grad_norm": 0.22835175693035126, | |
| "kl": 0.0084228515625, | |
| "learning_rate": 2.0496894409937888e-07, | |
| "loss": 0.0008, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 128 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 81.828125, | |
| "epoch": 0.8012422360248447, | |
| "grad_norm": 2.44989275932312, | |
| "kl": 0.007171630859375, | |
| "learning_rate": 1.9875776397515526e-07, | |
| "loss": 0.0007, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 129 | |
| }, | |
| { | |
| "advantages": 9.313225746154785e-10, | |
| "completion_length": 83.296875, | |
| "epoch": 0.8074534161490683, | |
| "grad_norm": 26.60379409790039, | |
| "kl": 0.01031494140625, | |
| "learning_rate": 1.9254658385093166e-07, | |
| "loss": 0.001, | |
| "reward": 1.640625, | |
| "reward_mean": 1.640625, | |
| "reward_std": 0.1530819982290268, | |
| "rewards/accuracy_reward": 0.640625, | |
| "rewards/format_reward": 1.0, | |
| "step": 130 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 75.296875, | |
| "epoch": 0.8136645962732919, | |
| "grad_norm": 2.649775981903076, | |
| "kl": 0.00787353515625, | |
| "learning_rate": 1.8633540372670807e-07, | |
| "loss": 0.0008, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.71875, | |
| "rewards/format_reward": 1.0, | |
| "step": 131 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 72.921875, | |
| "epoch": 0.8198757763975155, | |
| "grad_norm": 6.021523952484131, | |
| "kl": 0.017333984375, | |
| "learning_rate": 1.8012422360248447e-07, | |
| "loss": 0.0017, | |
| "reward": 1.546875, | |
| "reward_mean": 1.546875, | |
| "reward_std": 0.17358146607875824, | |
| "rewards/accuracy_reward": 0.546875, | |
| "rewards/format_reward": 1.0, | |
| "step": 132 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-09, | |
| "completion_length": 80.203125, | |
| "epoch": 0.8260869565217391, | |
| "grad_norm": 5.850553035736084, | |
| "kl": 0.0118408203125, | |
| "learning_rate": 1.7391304347826085e-07, | |
| "loss": 0.0012, | |
| "reward": 1.671875, | |
| "reward_mean": 1.671875, | |
| "reward_std": 0.25726157426834106, | |
| "rewards/accuracy_reward": 0.671875, | |
| "rewards/format_reward": 1.0, | |
| "step": 133 | |
| }, | |
| { | |
| "advantages": -5.587935447692871e-09, | |
| "completion_length": 90.234375, | |
| "epoch": 0.8322981366459627, | |
| "grad_norm": 9.700899124145508, | |
| "kl": 0.01422119140625, | |
| "learning_rate": 1.6770186335403728e-07, | |
| "loss": 0.0014, | |
| "reward": 1.78125, | |
| "reward_mean": 1.78125, | |
| "reward_std": 0.2651650309562683, | |
| "rewards/accuracy_reward": 0.828125, | |
| "rewards/format_reward": 0.953125, | |
| "step": 134 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 81.90625, | |
| "epoch": 0.8385093167701864, | |
| "grad_norm": 2.9975473880767822, | |
| "kl": 0.00909423828125, | |
| "learning_rate": 1.6149068322981366e-07, | |
| "loss": 0.0009, | |
| "reward": 1.6875, | |
| "reward_mean": 1.6875, | |
| "reward_std": 0.1552036553621292, | |
| "rewards/accuracy_reward": 0.6875, | |
| "rewards/format_reward": 1.0, | |
| "step": 135 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 79.3125, | |
| "epoch": 0.84472049689441, | |
| "grad_norm": 4.324582099914551, | |
| "kl": 0.01165771484375, | |
| "learning_rate": 1.5527950310559004e-07, | |
| "loss": 0.0012, | |
| "reward": 1.84375, | |
| "reward_mean": 1.84375, | |
| "reward_std": 0.2177756428718567, | |
| "rewards/accuracy_reward": 0.859375, | |
| "rewards/format_reward": 0.984375, | |
| "step": 136 | |
| }, | |
| { | |
| "advantages": 7.450580596923828e-09, | |
| "completion_length": 80.265625, | |
| "epoch": 0.8509316770186336, | |
| "grad_norm": 3.8911736011505127, | |
| "kl": 0.009521484375, | |
| "learning_rate": 1.4906832298136647e-07, | |
| "loss": 0.001, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.1552036553621292, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 137 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 78.40625, | |
| "epoch": 0.8571428571428571, | |
| "grad_norm": 2.864941120147705, | |
| "kl": 0.01129150390625, | |
| "learning_rate": 1.4285714285714285e-07, | |
| "loss": 0.0011, | |
| "reward": 1.765625, | |
| "reward_mean": 1.765625, | |
| "reward_std": 0.10205793380737305, | |
| "rewards/accuracy_reward": 0.765625, | |
| "rewards/format_reward": 1.0, | |
| "step": 138 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 80.21875, | |
| "epoch": 0.8633540372670807, | |
| "grad_norm": 5.788990497589111, | |
| "kl": 0.00799560546875, | |
| "learning_rate": 1.3664596273291925e-07, | |
| "loss": 0.0008, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.1552036553621292, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 139 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-09, | |
| "completion_length": 92.90625, | |
| "epoch": 0.8695652173913043, | |
| "grad_norm": 4.130926609039307, | |
| "kl": 0.0111083984375, | |
| "learning_rate": 1.3043478260869563e-07, | |
| "loss": 0.0011, | |
| "reward": 1.859375, | |
| "reward_mean": 1.859375, | |
| "reward_std": 0.2198973000049591, | |
| "rewards/accuracy_reward": 0.859375, | |
| "rewards/format_reward": 1.0, | |
| "step": 140 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 79.109375, | |
| "epoch": 0.8757763975155279, | |
| "grad_norm": 3.025212287902832, | |
| "kl": 0.01324462890625, | |
| "learning_rate": 1.2422360248447204e-07, | |
| "loss": 0.0013, | |
| "reward": 1.46875, | |
| "reward_mean": 1.46875, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/accuracy_reward": 0.484375, | |
| "rewards/format_reward": 0.984375, | |
| "step": 141 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 78.25, | |
| "epoch": 0.8819875776397516, | |
| "grad_norm": 6.828762531280518, | |
| "kl": 0.01177978515625, | |
| "learning_rate": 1.1801242236024844e-07, | |
| "loss": 0.0012, | |
| "reward": 1.40625, | |
| "reward_mean": 1.40625, | |
| "reward_std": 0.1462520956993103, | |
| "rewards/accuracy_reward": 0.40625, | |
| "rewards/format_reward": 1.0, | |
| "step": 142 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 73.625, | |
| "epoch": 0.8881987577639752, | |
| "grad_norm": 3.4486515522003174, | |
| "kl": 0.00762939453125, | |
| "learning_rate": 1.1180124223602484e-07, | |
| "loss": 0.0008, | |
| "reward": 1.65625, | |
| "reward_mean": 1.65625, | |
| "reward_std": 0.1552036553621292, | |
| "rewards/accuracy_reward": 0.65625, | |
| "rewards/format_reward": 1.0, | |
| "step": 143 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 80.359375, | |
| "epoch": 0.8944099378881988, | |
| "grad_norm": 8.272978782653809, | |
| "kl": 0.00628662109375, | |
| "learning_rate": 1.0559006211180124e-07, | |
| "loss": 0.0006, | |
| "reward": 1.71875, | |
| "reward_mean": 1.71875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.734375, | |
| "rewards/format_reward": 0.984375, | |
| "step": 144 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 81.1875, | |
| "epoch": 0.9006211180124224, | |
| "grad_norm": 4.848587512969971, | |
| "kl": 0.017578125, | |
| "learning_rate": 9.937888198757763e-08, | |
| "loss": 0.0018, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.2041158676147461, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 145 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 79.984375, | |
| "epoch": 0.906832298136646, | |
| "grad_norm": 0.3604845702648163, | |
| "kl": 0.00958251953125, | |
| "learning_rate": 9.316770186335403e-08, | |
| "loss": 0.001, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 146 | |
| }, | |
| { | |
| "advantages": 5.587935447692871e-09, | |
| "completion_length": 76.5625, | |
| "epoch": 0.9130434782608695, | |
| "grad_norm": 10.680438995361328, | |
| "kl": 0.01116943359375, | |
| "learning_rate": 8.695652173913042e-08, | |
| "loss": 0.0011, | |
| "reward": 1.671875, | |
| "reward_mean": 1.671875, | |
| "reward_std": 0.19939783215522766, | |
| "rewards/accuracy_reward": 0.671875, | |
| "rewards/format_reward": 1.0, | |
| "step": 147 | |
| }, | |
| { | |
| "advantages": -3.725290298461914e-09, | |
| "completion_length": 76.984375, | |
| "epoch": 0.9192546583850931, | |
| "grad_norm": 2.091907024383545, | |
| "kl": 0.010498046875, | |
| "learning_rate": 8.074534161490683e-08, | |
| "loss": 0.0011, | |
| "reward": 1.640625, | |
| "reward_mean": 1.640625, | |
| "reward_std": 0.08010874688625336, | |
| "rewards/accuracy_reward": 0.65625, | |
| "rewards/format_reward": 0.984375, | |
| "step": 148 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 76.203125, | |
| "epoch": 0.9254658385093167, | |
| "grad_norm": 0.20045147836208344, | |
| "kl": 0.0078125, | |
| "learning_rate": 7.453416149068323e-08, | |
| "loss": 0.0008, | |
| "reward": 1.75, | |
| "reward_mean": 1.75, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/format_reward": 1.0, | |
| "step": 149 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 84.140625, | |
| "epoch": 0.9316770186335404, | |
| "grad_norm": 3.21720814704895, | |
| "kl": 0.008544921875, | |
| "learning_rate": 6.832298136645963e-08, | |
| "loss": 0.0009, | |
| "reward": 1.6875, | |
| "reward_mean": 1.6875, | |
| "reward_std": 0.06681530922651291, | |
| "rewards/accuracy_reward": 0.6875, | |
| "rewards/format_reward": 1.0, | |
| "step": 150 | |
| }, | |
| { | |
| "advantages": -1.862645149230957e-09, | |
| "completion_length": 82.734375, | |
| "epoch": 0.937888198757764, | |
| "grad_norm": 7.955801963806152, | |
| "kl": 0.01263427734375, | |
| "learning_rate": 6.211180124223602e-08, | |
| "loss": 0.0013, | |
| "reward": 1.734375, | |
| "reward_mean": 1.734375, | |
| "reward_std": 0.10205793380737305, | |
| "rewards/accuracy_reward": 0.734375, | |
| "rewards/format_reward": 1.0, | |
| "step": 151 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 72.96875, | |
| "epoch": 0.9440993788819876, | |
| "grad_norm": 3.563530921936035, | |
| "kl": 0.0093994140625, | |
| "learning_rate": 5.590062111801242e-08, | |
| "loss": 0.0009, | |
| "reward": 1.90625, | |
| "reward_mean": 1.90625, | |
| "reward_std": 0.0578637570142746, | |
| "rewards/accuracy_reward": 0.90625, | |
| "rewards/format_reward": 1.0, | |
| "step": 152 | |
| }, | |
| { | |
| "advantages": -9.313225746154785e-09, | |
| "completion_length": 83.125, | |
| "epoch": 0.9503105590062112, | |
| "grad_norm": 9.811988830566406, | |
| "kl": 0.0184326171875, | |
| "learning_rate": 4.9689440993788814e-08, | |
| "loss": 0.0019, | |
| "reward": 1.796875, | |
| "reward_mean": 1.796875, | |
| "reward_std": 0.15992169082164764, | |
| "rewards/accuracy_reward": 0.796875, | |
| "rewards/format_reward": 1.0, | |
| "step": 153 | |
| }, | |
| { | |
| "advantages": -1.0244548320770264e-08, | |
| "completion_length": 83.515625, | |
| "epoch": 0.9565217391304348, | |
| "grad_norm": 3.8269639015197754, | |
| "kl": 0.0133056640625, | |
| "learning_rate": 4.347826086956521e-08, | |
| "loss": 0.0013, | |
| "reward": 1.78125, | |
| "reward_mean": 1.78125, | |
| "reward_std": 0.16675157845020294, | |
| "rewards/accuracy_reward": 0.78125, | |
| "rewards/format_reward": 1.0, | |
| "step": 154 | |
| }, | |
| { | |
| "advantages": -7.450580596923828e-09, | |
| "completion_length": 85.671875, | |
| "epoch": 0.9627329192546584, | |
| "grad_norm": 3.470165252685547, | |
| "kl": 0.01300048828125, | |
| "learning_rate": 3.726708074534162e-08, | |
| "loss": 0.0013, | |
| "reward": 1.5625, | |
| "reward_mean": 1.5625, | |
| "reward_std": 0.1462520956993103, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 1.0, | |
| "step": 155 | |
| }, | |
| { | |
| "advantages": -9.313225746154785e-10, | |
| "completion_length": 84.203125, | |
| "epoch": 0.968944099378882, | |
| "grad_norm": 2.550407648086548, | |
| "kl": 0.00946044921875, | |
| "learning_rate": 3.105590062111801e-08, | |
| "loss": 0.0009, | |
| "reward": 1.625, | |
| "reward_mean": 1.625, | |
| "reward_std": 0.16675157845020294, | |
| "rewards/accuracy_reward": 0.640625, | |
| "rewards/format_reward": 0.984375, | |
| "step": 156 | |
| }, | |
| { | |
| "advantages": 3.725290298461914e-09, | |
| "completion_length": 76.296875, | |
| "epoch": 0.9751552795031055, | |
| "grad_norm": 3.396425247192383, | |
| "kl": 0.00714111328125, | |
| "learning_rate": 2.4844720496894407e-08, | |
| "loss": 0.0007, | |
| "reward": 1.546875, | |
| "reward_mean": 1.546875, | |
| "reward_std": 0.15992169082164764, | |
| "rewards/accuracy_reward": 0.546875, | |
| "rewards/format_reward": 1.0, | |
| "step": 157 | |
| }, | |
| { | |
| "advantages": -2.7939677238464355e-09, | |
| "completion_length": 73.859375, | |
| "epoch": 0.9813664596273292, | |
| "grad_norm": 3.776041030883789, | |
| "kl": 0.00921630859375, | |
| "learning_rate": 1.863354037267081e-08, | |
| "loss": 0.0009, | |
| "reward": 1.59375, | |
| "reward_mean": 1.59375, | |
| "reward_std": 0.10888782143592834, | |
| "rewards/accuracy_reward": 0.59375, | |
| "rewards/format_reward": 1.0, | |
| "step": 158 | |
| }, | |
| { | |
| "advantages": 1.862645149230957e-09, | |
| "completion_length": 77.953125, | |
| "epoch": 0.9875776397515528, | |
| "grad_norm": 3.304471254348755, | |
| "kl": 0.01263427734375, | |
| "learning_rate": 1.2422360248447204e-08, | |
| "loss": 0.0013, | |
| "reward": 1.796875, | |
| "reward_mean": 1.796875, | |
| "reward_std": 0.11100947856903076, | |
| "rewards/accuracy_reward": 0.796875, | |
| "rewards/format_reward": 1.0, | |
| "step": 159 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "completion_length": 79.25, | |
| "epoch": 0.9937888198757764, | |
| "grad_norm": 5.823967456817627, | |
| "kl": 0.00897216796875, | |
| "learning_rate": 6.211180124223602e-09, | |
| "loss": 0.0009, | |
| "reward": 1.5, | |
| "reward_mean": 1.5, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 1.0, | |
| "step": 160 | |
| }, | |
| { | |
| "advantages": -0.5890890955924988, | |
| "completion_length": 89.33333587646484, | |
| "epoch": 1.0, | |
| "grad_norm": 2.0931286811828613, | |
| "kl": 0.00677490234375, | |
| "learning_rate": 0.0, | |
| "loss": 0.001, | |
| "reward": 1.6666667461395264, | |
| "reward_mean": 1.875, | |
| "reward_std": 0.3535533845424652, | |
| "rewards/accuracy_reward": 0.6666666865348816, | |
| "rewards/format_reward": 1.0, | |
| "step": 161 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 161, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |