| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.8695652173913043, |
| "eval_steps": 500, |
| "global_step": 100, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 208.5885467529297, |
| "epoch": 0.008695652173913044, |
| "grad_norm": 2.329385757446289, |
| "kl": 0.0, |
| "learning_rate": 9.956521739130434e-07, |
| "loss": 0.0, |
| "reward": 1.7678645849227905, |
| "reward_std": 0.3930448889732361, |
| "rewards/correctness_reward_vllm": 0.7756770849227905, |
| "rewards/length_reward": 0.9921875, |
| "step": 1 |
| }, |
| { |
| "completion_length": 199.15625, |
| "epoch": 0.017391304347826087, |
| "grad_norm": 3.4958019256591797, |
| "kl": 0.0003414154052734375, |
| "learning_rate": 9.91304347826087e-07, |
| "loss": 0.0, |
| "reward": 1.792708396911621, |
| "reward_std": 0.37898510694503784, |
| "rewards/correctness_reward_vllm": 0.7994792461395264, |
| "rewards/length_reward": 0.9932291507720947, |
| "step": 2 |
| }, |
| { |
| "completion_length": 201.4166717529297, |
| "epoch": 0.02608695652173913, |
| "grad_norm": 3.472921848297119, |
| "kl": 0.000431060791015625, |
| "learning_rate": 9.869565217391304e-07, |
| "loss": 0.0, |
| "reward": 1.732135534286499, |
| "reward_std": 0.41073521971702576, |
| "rewards/correctness_reward_vllm": 0.7576562762260437, |
| "rewards/length_reward": 0.9744791984558105, |
| "step": 3 |
| }, |
| { |
| "completion_length": 203.53646850585938, |
| "epoch": 0.034782608695652174, |
| "grad_norm": 3.1970794200897217, |
| "kl": 0.0004558563232421875, |
| "learning_rate": 9.826086956521739e-07, |
| "loss": 0.0, |
| "reward": 1.8510417938232422, |
| "reward_std": 0.3472898006439209, |
| "rewards/correctness_reward_vllm": 0.8645833730697632, |
| "rewards/length_reward": 0.9864583015441895, |
| "step": 4 |
| }, |
| { |
| "completion_length": 198.92709350585938, |
| "epoch": 0.043478260869565216, |
| "grad_norm": 3.3482213020324707, |
| "kl": 0.000408172607421875, |
| "learning_rate": 9.782608695652173e-07, |
| "loss": 0.0, |
| "reward": 1.7761459350585938, |
| "reward_std": 0.3146688938140869, |
| "rewards/correctness_reward_vllm": 0.7995834350585938, |
| "rewards/length_reward": 0.9765625, |
| "step": 5 |
| }, |
| { |
| "completion_length": 207.5, |
| "epoch": 0.05217391304347826, |
| "grad_norm": 4.089375972747803, |
| "kl": 0.00046539306640625, |
| "learning_rate": 9.73913043478261e-07, |
| "loss": 0.0, |
| "reward": 1.8823437690734863, |
| "reward_std": 0.3251632750034332, |
| "rewards/correctness_reward_vllm": 0.8891145586967468, |
| "rewards/length_reward": 0.9932291507720947, |
| "step": 6 |
| }, |
| { |
| "completion_length": 203.44271850585938, |
| "epoch": 0.06086956521739131, |
| "grad_norm": 3.0994174480438232, |
| "kl": 0.00042724609375, |
| "learning_rate": 9.695652173913042e-07, |
| "loss": 0.0, |
| "reward": 1.8064062595367432, |
| "reward_std": 0.3677879273891449, |
| "rewards/correctness_reward_vllm": 0.8230729103088379, |
| "rewards/length_reward": 0.9833333492279053, |
| "step": 7 |
| }, |
| { |
| "completion_length": 198.77084350585938, |
| "epoch": 0.06956521739130435, |
| "grad_norm": 2.4492053985595703, |
| "kl": 0.00041961669921875, |
| "learning_rate": 9.652173913043478e-07, |
| "loss": 0.0, |
| "reward": 1.8719271421432495, |
| "reward_std": 0.3530654311180115, |
| "rewards/correctness_reward_vllm": 0.8927604556083679, |
| "rewards/length_reward": 0.9791666865348816, |
| "step": 8 |
| }, |
| { |
| "completion_length": 205.6510467529297, |
| "epoch": 0.0782608695652174, |
| "grad_norm": 2.360391855239868, |
| "kl": 0.00041961669921875, |
| "learning_rate": 9.608695652173912e-07, |
| "loss": 0.0, |
| "reward": 1.8904168605804443, |
| "reward_std": 0.36921584606170654, |
| "rewards/correctness_reward_vllm": 0.8904166221618652, |
| "rewards/length_reward": 1.0, |
| "step": 9 |
| }, |
| { |
| "completion_length": 205.4791717529297, |
| "epoch": 0.08695652173913043, |
| "grad_norm": 3.7472403049468994, |
| "kl": 0.000499725341796875, |
| "learning_rate": 9.565217391304349e-07, |
| "loss": 0.0, |
| "reward": 1.8460416793823242, |
| "reward_std": 0.3746611475944519, |
| "rewards/correctness_reward_vllm": 0.8543750643730164, |
| "rewards/length_reward": 0.9916666746139526, |
| "step": 10 |
| }, |
| { |
| "completion_length": 199.20834350585938, |
| "epoch": 0.09565217391304348, |
| "grad_norm": 3.843543767929077, |
| "kl": 0.00051116943359375, |
| "learning_rate": 9.521739130434783e-07, |
| "loss": 0.0, |
| "reward": 1.7471874952316284, |
| "reward_std": 0.33215177059173584, |
| "rewards/correctness_reward_vllm": 0.7471875548362732, |
| "rewards/length_reward": 1.0, |
| "step": 11 |
| }, |
| { |
| "completion_length": 212.421875, |
| "epoch": 0.10434782608695652, |
| "grad_norm": 4.518510341644287, |
| "kl": 0.00049591064453125, |
| "learning_rate": 9.478260869565216e-07, |
| "loss": 0.0, |
| "reward": 1.9361979961395264, |
| "reward_std": 0.37871116399765015, |
| "rewards/correctness_reward_vllm": 0.9361978769302368, |
| "rewards/length_reward": 1.0, |
| "step": 12 |
| }, |
| { |
| "completion_length": 196.06771850585938, |
| "epoch": 0.11304347826086956, |
| "grad_norm": 2.444908380508423, |
| "kl": 0.000583648681640625, |
| "learning_rate": 9.434782608695652e-07, |
| "loss": 0.0, |
| "reward": 1.8197916746139526, |
| "reward_std": 0.3508337140083313, |
| "rewards/correctness_reward_vllm": 0.8348957896232605, |
| "rewards/length_reward": 0.9848958849906921, |
| "step": 13 |
| }, |
| { |
| "completion_length": 198.2916717529297, |
| "epoch": 0.12173913043478261, |
| "grad_norm": 3.8315980434417725, |
| "kl": 0.000629425048828125, |
| "learning_rate": 9.391304347826087e-07, |
| "loss": 0.0, |
| "reward": 1.7896875143051147, |
| "reward_std": 0.3679579496383667, |
| "rewards/correctness_reward_vllm": 0.8021876215934753, |
| "rewards/length_reward": 0.9874999523162842, |
| "step": 14 |
| }, |
| { |
| "completion_length": 219.5572967529297, |
| "epoch": 0.13043478260869565, |
| "grad_norm": 3.2742035388946533, |
| "kl": 0.000690460205078125, |
| "learning_rate": 9.347826086956522e-07, |
| "loss": 0.0, |
| "reward": 1.8117709159851074, |
| "reward_std": 0.3478308320045471, |
| "rewards/correctness_reward_vllm": 0.8185417056083679, |
| "rewards/length_reward": 0.9932291507720947, |
| "step": 15 |
| }, |
| { |
| "completion_length": 200.30209350585938, |
| "epoch": 0.1391304347826087, |
| "grad_norm": 3.956608772277832, |
| "kl": 0.000659942626953125, |
| "learning_rate": 9.304347826086955e-07, |
| "loss": 0.0, |
| "reward": 1.8940625190734863, |
| "reward_std": 0.387278288602829, |
| "rewards/correctness_reward_vllm": 0.9216668009757996, |
| "rewards/length_reward": 0.9723958373069763, |
| "step": 16 |
| }, |
| { |
| "completion_length": 202.625, |
| "epoch": 0.14782608695652175, |
| "grad_norm": 4.350553512573242, |
| "kl": 0.000652313232421875, |
| "learning_rate": 9.260869565217391e-07, |
| "loss": 0.0, |
| "reward": 1.854322910308838, |
| "reward_std": 0.3680115342140198, |
| "rewards/correctness_reward_vllm": 0.8694270849227905, |
| "rewards/length_reward": 0.9848958253860474, |
| "step": 17 |
| }, |
| { |
| "completion_length": 204.4479217529297, |
| "epoch": 0.1565217391304348, |
| "grad_norm": 2.888611078262329, |
| "kl": 0.000804901123046875, |
| "learning_rate": 9.217391304347826e-07, |
| "loss": 0.0, |
| "reward": 1.8798437118530273, |
| "reward_std": 0.3778643012046814, |
| "rewards/correctness_reward_vllm": 0.8881769180297852, |
| "rewards/length_reward": 0.9916666746139526, |
| "step": 18 |
| }, |
| { |
| "completion_length": 202.02084350585938, |
| "epoch": 0.16521739130434782, |
| "grad_norm": 4.008359909057617, |
| "kl": 0.000759124755859375, |
| "learning_rate": 9.17391304347826e-07, |
| "loss": 0.0, |
| "reward": 1.7861980199813843, |
| "reward_std": 0.3587394952774048, |
| "rewards/correctness_reward_vllm": 0.7945312857627869, |
| "rewards/length_reward": 0.9916666746139526, |
| "step": 19 |
| }, |
| { |
| "completion_length": 212.41146850585938, |
| "epoch": 0.17391304347826086, |
| "grad_norm": 3.486192226409912, |
| "kl": 0.00079345703125, |
| "learning_rate": 9.130434782608695e-07, |
| "loss": 0.0, |
| "reward": 1.8601564168930054, |
| "reward_std": 0.38853877782821655, |
| "rewards/correctness_reward_vllm": 0.8736979961395264, |
| "rewards/length_reward": 0.9864583015441895, |
| "step": 20 |
| }, |
| { |
| "completion_length": 205.70834350585938, |
| "epoch": 0.1826086956521739, |
| "grad_norm": 3.196575403213501, |
| "kl": 0.00087738037109375, |
| "learning_rate": 9.08695652173913e-07, |
| "loss": 0.0, |
| "reward": 1.8731770515441895, |
| "reward_std": 0.3991861343383789, |
| "rewards/correctness_reward_vllm": 0.8773437738418579, |
| "rewards/length_reward": 0.9958333373069763, |
| "step": 21 |
| }, |
| { |
| "completion_length": 205.36459350585938, |
| "epoch": 0.19130434782608696, |
| "grad_norm": 4.234310626983643, |
| "kl": 0.0009002685546875, |
| "learning_rate": 9.043478260869564e-07, |
| "loss": 0.0, |
| "reward": 1.701145887374878, |
| "reward_std": 0.3736024796962738, |
| "rewards/correctness_reward_vllm": 0.7079167366027832, |
| "rewards/length_reward": 0.9932291507720947, |
| "step": 22 |
| }, |
| { |
| "completion_length": 220.7447967529297, |
| "epoch": 0.2, |
| "grad_norm": 6.220200061798096, |
| "kl": 0.00084686279296875, |
| "learning_rate": 9e-07, |
| "loss": 0.0, |
| "reward": 1.84416663646698, |
| "reward_std": 0.3591594696044922, |
| "rewards/correctness_reward_vllm": 0.8493751287460327, |
| "rewards/length_reward": 0.9947916865348816, |
| "step": 23 |
| }, |
| { |
| "completion_length": 200.5885467529297, |
| "epoch": 0.20869565217391303, |
| "grad_norm": 2.584165573120117, |
| "kl": 0.00090789794921875, |
| "learning_rate": 8.956521739130435e-07, |
| "loss": 0.0, |
| "reward": 1.8740625381469727, |
| "reward_std": 0.3568645417690277, |
| "rewards/correctness_reward_vllm": 0.8948957324028015, |
| "rewards/length_reward": 0.9791666865348816, |
| "step": 24 |
| }, |
| { |
| "completion_length": 210.89584350585938, |
| "epoch": 0.21739130434782608, |
| "grad_norm": 3.030820608139038, |
| "kl": 0.00096893310546875, |
| "learning_rate": 8.913043478260869e-07, |
| "loss": 0.0, |
| "reward": 1.8118228912353516, |
| "reward_std": 0.3701898455619812, |
| "rewards/correctness_reward_vllm": 0.8264062404632568, |
| "rewards/length_reward": 0.9854166507720947, |
| "step": 25 |
| }, |
| { |
| "completion_length": 208.3854217529297, |
| "epoch": 0.22608695652173913, |
| "grad_norm": 3.262943983078003, |
| "kl": 0.00106048583984375, |
| "learning_rate": 8.869565217391303e-07, |
| "loss": 0.0, |
| "reward": 1.9187500476837158, |
| "reward_std": 0.34354889392852783, |
| "rewards/correctness_reward_vllm": 0.9229166507720947, |
| "rewards/length_reward": 0.9958333373069763, |
| "step": 26 |
| }, |
| { |
| "completion_length": 193.640625, |
| "epoch": 0.23478260869565218, |
| "grad_norm": 3.750316619873047, |
| "kl": 0.00118255615234375, |
| "learning_rate": 8.826086956521739e-07, |
| "loss": 0.0, |
| "reward": 1.8126041889190674, |
| "reward_std": 0.3690093457698822, |
| "rewards/correctness_reward_vllm": 0.8292707800865173, |
| "rewards/length_reward": 0.9833333492279053, |
| "step": 27 |
| }, |
| { |
| "completion_length": 210.25521850585938, |
| "epoch": 0.24347826086956523, |
| "grad_norm": 2.774629592895508, |
| "kl": 0.00115966796875, |
| "learning_rate": 8.782608695652174e-07, |
| "loss": 0.0, |
| "reward": 1.8429688215255737, |
| "reward_std": 0.3622826039791107, |
| "rewards/correctness_reward_vllm": 0.8507812023162842, |
| "rewards/length_reward": 0.9921875, |
| "step": 28 |
| }, |
| { |
| "completion_length": 220.1354217529297, |
| "epoch": 0.25217391304347825, |
| "grad_norm": 3.555556297302246, |
| "kl": 0.0013275146484375, |
| "learning_rate": 8.739130434782607e-07, |
| "loss": 0.0001, |
| "reward": 1.793125033378601, |
| "reward_std": 0.3758922517299652, |
| "rewards/correctness_reward_vllm": 0.8051042556762695, |
| "rewards/length_reward": 0.9880208373069763, |
| "step": 29 |
| }, |
| { |
| "completion_length": 213.359375, |
| "epoch": 0.2608695652173913, |
| "grad_norm": 10.03189754486084, |
| "kl": 0.0013580322265625, |
| "learning_rate": 8.695652173913043e-07, |
| "loss": 0.0001, |
| "reward": 1.8984894752502441, |
| "reward_std": 0.37575191259384155, |
| "rewards/correctness_reward_vllm": 0.9010938405990601, |
| "rewards/length_reward": 0.9973958730697632, |
| "step": 30 |
| }, |
| { |
| "completion_length": 206.078125, |
| "epoch": 0.26956521739130435, |
| "grad_norm": 3.930840253829956, |
| "kl": 0.00125885009765625, |
| "learning_rate": 8.652173913043478e-07, |
| "loss": 0.0001, |
| "reward": 1.9381771087646484, |
| "reward_std": 0.346474826335907, |
| "rewards/correctness_reward_vllm": 0.9381771087646484, |
| "rewards/length_reward": 1.0, |
| "step": 31 |
| }, |
| { |
| "completion_length": 205.73959350585938, |
| "epoch": 0.2782608695652174, |
| "grad_norm": 2.839338779449463, |
| "kl": 0.00125885009765625, |
| "learning_rate": 8.608695652173913e-07, |
| "loss": 0.0001, |
| "reward": 1.9146875143051147, |
| "reward_std": 0.3288370370864868, |
| "rewards/correctness_reward_vllm": 0.9256248474121094, |
| "rewards/length_reward": 0.989062488079071, |
| "step": 32 |
| }, |
| { |
| "completion_length": 218.8697967529297, |
| "epoch": 0.28695652173913044, |
| "grad_norm": 2.71004056930542, |
| "kl": 0.0011749267578125, |
| "learning_rate": 8.565217391304348e-07, |
| "loss": 0.0, |
| "reward": 1.9505729675292969, |
| "reward_std": 0.36752164363861084, |
| "rewards/correctness_reward_vllm": 0.9641145467758179, |
| "rewards/length_reward": 0.9864583015441895, |
| "step": 33 |
| }, |
| { |
| "completion_length": 223.515625, |
| "epoch": 0.2956521739130435, |
| "grad_norm": 3.4277288913726807, |
| "kl": 0.001434326171875, |
| "learning_rate": 8.521739130434782e-07, |
| "loss": 0.0001, |
| "reward": 1.887239694595337, |
| "reward_std": 0.3722625970840454, |
| "rewards/correctness_reward_vllm": 0.899218738079071, |
| "rewards/length_reward": 0.9880208373069763, |
| "step": 34 |
| }, |
| { |
| "completion_length": 202.125, |
| "epoch": 0.30434782608695654, |
| "grad_norm": 4.426881790161133, |
| "kl": 0.00148773193359375, |
| "learning_rate": 8.478260869565217e-07, |
| "loss": 0.0001, |
| "reward": 1.7834895849227905, |
| "reward_std": 0.3372436463832855, |
| "rewards/correctness_reward_vllm": 0.7886979579925537, |
| "rewards/length_reward": 0.9947916865348816, |
| "step": 35 |
| }, |
| { |
| "completion_length": 211.09375, |
| "epoch": 0.3130434782608696, |
| "grad_norm": 2.230402708053589, |
| "kl": 0.0015106201171875, |
| "learning_rate": 8.434782608695652e-07, |
| "loss": 0.0001, |
| "reward": 1.8795312643051147, |
| "reward_std": 0.35857951641082764, |
| "rewards/correctness_reward_vllm": 0.8930728435516357, |
| "rewards/length_reward": 0.9864583015441895, |
| "step": 36 |
| }, |
| { |
| "completion_length": 208.34896850585938, |
| "epoch": 0.3217391304347826, |
| "grad_norm": 19.257694244384766, |
| "kl": 0.0014190673828125, |
| "learning_rate": 8.391304347826087e-07, |
| "loss": 0.0001, |
| "reward": 1.9402083158493042, |
| "reward_std": 0.3405280113220215, |
| "rewards/correctness_reward_vllm": 0.9402083158493042, |
| "rewards/length_reward": 1.0, |
| "step": 37 |
| }, |
| { |
| "completion_length": 196.92709350585938, |
| "epoch": 0.33043478260869563, |
| "grad_norm": 3.1662447452545166, |
| "kl": 0.001678466796875, |
| "learning_rate": 8.347826086956521e-07, |
| "loss": 0.0001, |
| "reward": 1.8916146755218506, |
| "reward_std": 0.35224398970603943, |
| "rewards/correctness_reward_vllm": 0.8999478220939636, |
| "rewards/length_reward": 0.9916666746139526, |
| "step": 38 |
| }, |
| { |
| "completion_length": 218.38021850585938, |
| "epoch": 0.3391304347826087, |
| "grad_norm": 2.1624081134796143, |
| "kl": 0.0016021728515625, |
| "learning_rate": 8.304347826086955e-07, |
| "loss": 0.0001, |
| "reward": 1.990781307220459, |
| "reward_std": 0.34267401695251465, |
| "rewards/correctness_reward_vllm": 1.0017187595367432, |
| "rewards/length_reward": 0.989062488079071, |
| "step": 39 |
| }, |
| { |
| "completion_length": 202.94271850585938, |
| "epoch": 0.34782608695652173, |
| "grad_norm": 3.153355598449707, |
| "kl": 0.0018310546875, |
| "learning_rate": 8.260869565217391e-07, |
| "loss": 0.0001, |
| "reward": 1.8341145515441895, |
| "reward_std": 0.3642037808895111, |
| "rewards/correctness_reward_vllm": 0.8393229246139526, |
| "rewards/length_reward": 0.9947916865348816, |
| "step": 40 |
| }, |
| { |
| "completion_length": 205.2760467529297, |
| "epoch": 0.3565217391304348, |
| "grad_norm": 3.370213747024536, |
| "kl": 0.00193023681640625, |
| "learning_rate": 8.217391304347826e-07, |
| "loss": 0.0001, |
| "reward": 1.8303645849227905, |
| "reward_std": 0.3733619153499603, |
| "rewards/correctness_reward_vllm": 0.8303646445274353, |
| "rewards/length_reward": 1.0, |
| "step": 41 |
| }, |
| { |
| "completion_length": 206.6354217529297, |
| "epoch": 0.3652173913043478, |
| "grad_norm": 7.641461372375488, |
| "kl": 0.00191497802734375, |
| "learning_rate": 8.173913043478261e-07, |
| "loss": 0.0001, |
| "reward": 1.8568229675292969, |
| "reward_std": 0.3796784281730652, |
| "rewards/correctness_reward_vllm": 0.8677603602409363, |
| "rewards/length_reward": 0.989062488079071, |
| "step": 42 |
| }, |
| { |
| "completion_length": 208.90625, |
| "epoch": 0.3739130434782609, |
| "grad_norm": 4.36552095413208, |
| "kl": 0.00179290771484375, |
| "learning_rate": 8.130434782608695e-07, |
| "loss": 0.0001, |
| "reward": 1.8441667556762695, |
| "reward_std": 0.319492369890213, |
| "rewards/correctness_reward_vllm": 0.84416663646698, |
| "rewards/length_reward": 1.0, |
| "step": 43 |
| }, |
| { |
| "completion_length": 211.84896850585938, |
| "epoch": 0.3826086956521739, |
| "grad_norm": 3.3810644149780273, |
| "kl": 0.001983642578125, |
| "learning_rate": 8.08695652173913e-07, |
| "loss": 0.0001, |
| "reward": 1.8228124380111694, |
| "reward_std": 0.39267587661743164, |
| "rewards/correctness_reward_vllm": 0.8389584422111511, |
| "rewards/length_reward": 0.9838541746139526, |
| "step": 44 |
| }, |
| { |
| "completion_length": 214.06771850585938, |
| "epoch": 0.391304347826087, |
| "grad_norm": 3.44265079498291, |
| "kl": 0.0020751953125, |
| "learning_rate": 8.043478260869565e-07, |
| "loss": 0.0001, |
| "reward": 1.7645833492279053, |
| "reward_std": 0.40830251574516296, |
| "rewards/correctness_reward_vllm": 0.7880208492279053, |
| "rewards/length_reward": 0.9765625, |
| "step": 45 |
| }, |
| { |
| "completion_length": 213.77084350585938, |
| "epoch": 0.4, |
| "grad_norm": 3.346031427383423, |
| "kl": 0.00194549560546875, |
| "learning_rate": 8e-07, |
| "loss": 0.0001, |
| "reward": 1.8528125286102295, |
| "reward_std": 0.39482760429382324, |
| "rewards/correctness_reward_vllm": 0.8637499809265137, |
| "rewards/length_reward": 0.989062488079071, |
| "step": 46 |
| }, |
| { |
| "completion_length": 209.72396850585938, |
| "epoch": 0.40869565217391307, |
| "grad_norm": 5.3741374015808105, |
| "kl": 0.0023651123046875, |
| "learning_rate": 7.956521739130434e-07, |
| "loss": 0.0001, |
| "reward": 1.8273438215255737, |
| "reward_std": 0.3434736728668213, |
| "rewards/correctness_reward_vllm": 0.8299479484558105, |
| "rewards/length_reward": 0.9973958730697632, |
| "step": 47 |
| }, |
| { |
| "completion_length": 210.171875, |
| "epoch": 0.41739130434782606, |
| "grad_norm": 2.268425941467285, |
| "kl": 0.0022125244140625, |
| "learning_rate": 7.913043478260869e-07, |
| "loss": 0.0001, |
| "reward": 1.9866665601730347, |
| "reward_std": 0.3515927195549011, |
| "rewards/correctness_reward_vllm": 1.0028126239776611, |
| "rewards/length_reward": 0.9838541746139526, |
| "step": 48 |
| }, |
| { |
| "completion_length": 214.421875, |
| "epoch": 0.4260869565217391, |
| "grad_norm": 3.0277607440948486, |
| "kl": 0.0023193359375, |
| "learning_rate": 7.869565217391305e-07, |
| "loss": 0.0001, |
| "reward": 1.8909896612167358, |
| "reward_std": 0.3666800856590271, |
| "rewards/correctness_reward_vllm": 0.8977603912353516, |
| "rewards/length_reward": 0.9932291507720947, |
| "step": 49 |
| }, |
| { |
| "completion_length": 199.6979217529297, |
| "epoch": 0.43478260869565216, |
| "grad_norm": 3.3212649822235107, |
| "kl": 0.002349853515625, |
| "learning_rate": 7.826086956521739e-07, |
| "loss": 0.0001, |
| "reward": 1.8709373474121094, |
| "reward_std": 0.381402850151062, |
| "rewards/correctness_reward_vllm": 0.8792707920074463, |
| "rewards/length_reward": 0.9916666746139526, |
| "step": 50 |
| }, |
| { |
| "completion_length": 210.41146850585938, |
| "epoch": 0.4434782608695652, |
| "grad_norm": 2.650980234146118, |
| "kl": 0.002227783203125, |
| "learning_rate": 7.782608695652173e-07, |
| "loss": 0.0001, |
| "reward": 1.9605729579925537, |
| "reward_std": 0.3226872384548187, |
| "rewards/correctness_reward_vllm": 0.9673436880111694, |
| "rewards/length_reward": 0.9932291507720947, |
| "step": 51 |
| }, |
| { |
| "completion_length": 210.3072967529297, |
| "epoch": 0.45217391304347826, |
| "grad_norm": 2.7690789699554443, |
| "kl": 0.002410888671875, |
| "learning_rate": 7.739130434782608e-07, |
| "loss": 0.0001, |
| "reward": 1.937760591506958, |
| "reward_std": 0.34728074073791504, |
| "rewards/correctness_reward_vllm": 0.9513019919395447, |
| "rewards/length_reward": 0.9864583015441895, |
| "step": 52 |
| }, |
| { |
| "completion_length": 202.38021850585938, |
| "epoch": 0.4608695652173913, |
| "grad_norm": 3.0100247859954834, |
| "kl": 0.002288818359375, |
| "learning_rate": 7.695652173913043e-07, |
| "loss": 0.0001, |
| "reward": 1.8619270324707031, |
| "reward_std": 0.3619835376739502, |
| "rewards/correctness_reward_vllm": 0.8619270324707031, |
| "rewards/length_reward": 1.0, |
| "step": 53 |
| }, |
| { |
| "completion_length": 207.6666717529297, |
| "epoch": 0.46956521739130436, |
| "grad_norm": 4.956729888916016, |
| "kl": 0.0023956298828125, |
| "learning_rate": 7.652173913043478e-07, |
| "loss": 0.0001, |
| "reward": 1.8556770086288452, |
| "reward_std": 0.3410601019859314, |
| "rewards/correctness_reward_vllm": 0.8666146993637085, |
| "rewards/length_reward": 0.989062488079071, |
| "step": 54 |
| }, |
| { |
| "completion_length": 210.65625, |
| "epoch": 0.4782608695652174, |
| "grad_norm": 6.263027191162109, |
| "kl": 0.0027008056640625, |
| "learning_rate": 7.608695652173913e-07, |
| "loss": 0.0001, |
| "reward": 1.796875, |
| "reward_std": 0.3679881989955902, |
| "rewards/correctness_reward_vllm": 0.8020833730697632, |
| "rewards/length_reward": 0.9947916865348816, |
| "step": 55 |
| }, |
| { |
| "completion_length": 209.0416717529297, |
| "epoch": 0.48695652173913045, |
| "grad_norm": 3.7194344997406006, |
| "kl": 0.002960205078125, |
| "learning_rate": 7.565217391304347e-07, |
| "loss": 0.0001, |
| "reward": 1.902083396911621, |
| "reward_std": 0.3579518496990204, |
| "rewards/correctness_reward_vllm": 0.9020832777023315, |
| "rewards/length_reward": 1.0, |
| "step": 56 |
| }, |
| { |
| "completion_length": 213.625, |
| "epoch": 0.4956521739130435, |
| "grad_norm": 7.1098246574401855, |
| "kl": 0.002716064453125, |
| "learning_rate": 7.521739130434782e-07, |
| "loss": 0.0001, |
| "reward": 1.8305728435516357, |
| "reward_std": 0.3378227949142456, |
| "rewards/correctness_reward_vllm": 0.8482812643051147, |
| "rewards/length_reward": 0.9822916984558105, |
| "step": 57 |
| }, |
| { |
| "completion_length": 210.109375, |
| "epoch": 0.5043478260869565, |
| "grad_norm": 2.48675799369812, |
| "kl": 0.0030517578125, |
| "learning_rate": 7.478260869565217e-07, |
| "loss": 0.0001, |
| "reward": 1.8633334636688232, |
| "reward_std": 0.3492564558982849, |
| "rewards/correctness_reward_vllm": 0.8659374713897705, |
| "rewards/length_reward": 0.9973958730697632, |
| "step": 58 |
| }, |
| { |
| "completion_length": 218.73959350585938, |
| "epoch": 0.5130434782608696, |
| "grad_norm": 3.753242254257202, |
| "kl": 0.0026702880859375, |
| "learning_rate": 7.434782608695653e-07, |
| "loss": 0.0001, |
| "reward": 1.9696353673934937, |
| "reward_std": 0.33653122186660767, |
| "rewards/correctness_reward_vllm": 0.9883853793144226, |
| "rewards/length_reward": 0.981249988079071, |
| "step": 59 |
| }, |
| { |
| "completion_length": 221.8072967529297, |
| "epoch": 0.5217391304347826, |
| "grad_norm": 2.6025142669677734, |
| "kl": 0.0027618408203125, |
| "learning_rate": 7.391304347826086e-07, |
| "loss": 0.0001, |
| "reward": 1.8679168224334717, |
| "reward_std": 0.3256112039089203, |
| "rewards/correctness_reward_vllm": 0.8788542151451111, |
| "rewards/length_reward": 0.989062488079071, |
| "step": 60 |
| }, |
| { |
| "completion_length": 210.16146850585938, |
| "epoch": 0.5304347826086957, |
| "grad_norm": 3.2735888957977295, |
| "kl": 0.002960205078125, |
| "learning_rate": 7.347826086956521e-07, |
| "loss": 0.0001, |
| "reward": 1.9982812404632568, |
| "reward_std": 0.36957651376724243, |
| "rewards/correctness_reward_vllm": 1.005052089691162, |
| "rewards/length_reward": 0.9932291507720947, |
| "step": 61 |
| }, |
| { |
| "completion_length": 217.3229217529297, |
| "epoch": 0.5391304347826087, |
| "grad_norm": 5.706833362579346, |
| "kl": 0.0030364990234375, |
| "learning_rate": 7.304347826086957e-07, |
| "loss": 0.0001, |
| "reward": 1.949479103088379, |
| "reward_std": 0.3874410390853882, |
| "rewards/correctness_reward_vllm": 0.9619792699813843, |
| "rewards/length_reward": 0.9875000715255737, |
| "step": 62 |
| }, |
| { |
| "completion_length": 230.703125, |
| "epoch": 0.5478260869565217, |
| "grad_norm": 2.584719657897949, |
| "kl": 0.0027923583984375, |
| "learning_rate": 7.260869565217391e-07, |
| "loss": 0.0001, |
| "reward": 1.8884896039962769, |
| "reward_std": 0.3631989657878876, |
| "rewards/correctness_reward_vllm": 0.905677080154419, |
| "rewards/length_reward": 0.9828125238418579, |
| "step": 63 |
| }, |
| { |
| "completion_length": 206.375, |
| "epoch": 0.5565217391304348, |
| "grad_norm": 4.763845443725586, |
| "kl": 0.0034332275390625, |
| "learning_rate": 7.217391304347826e-07, |
| "loss": 0.0001, |
| "reward": 1.813020944595337, |
| "reward_std": 0.36838170886039734, |
| "rewards/correctness_reward_vllm": 0.8276041746139526, |
| "rewards/length_reward": 0.9854166507720947, |
| "step": 64 |
| }, |
| { |
| "completion_length": 213.7291717529297, |
| "epoch": 0.5652173913043478, |
| "grad_norm": 2.8753910064697266, |
| "kl": 0.003021240234375, |
| "learning_rate": 7.17391304347826e-07, |
| "loss": 0.0001, |
| "reward": 1.838854193687439, |
| "reward_std": 0.3463793992996216, |
| "rewards/correctness_reward_vllm": 0.8456250429153442, |
| "rewards/length_reward": 0.9932291507720947, |
| "step": 65 |
| }, |
| { |
| "completion_length": 221.125, |
| "epoch": 0.5739130434782609, |
| "grad_norm": 5.015041828155518, |
| "kl": 0.0030670166015625, |
| "learning_rate": 7.130434782608695e-07, |
| "loss": 0.0001, |
| "reward": 1.9818229675292969, |
| "reward_std": 0.3662077784538269, |
| "rewards/correctness_reward_vllm": 0.9844271540641785, |
| "rewards/length_reward": 0.9973958730697632, |
| "step": 66 |
| }, |
| { |
| "completion_length": 214.05209350585938, |
| "epoch": 0.5826086956521739, |
| "grad_norm": 4.653159141540527, |
| "kl": 0.0031585693359375, |
| "learning_rate": 7.08695652173913e-07, |
| "loss": 0.0001, |
| "reward": 1.8708856105804443, |
| "reward_std": 0.3716661334037781, |
| "rewards/correctness_reward_vllm": 0.8859895467758179, |
| "rewards/length_reward": 0.9848958849906921, |
| "step": 67 |
| }, |
| { |
| "completion_length": 202.1197967529297, |
| "epoch": 0.591304347826087, |
| "grad_norm": 19.68362808227539, |
| "kl": 0.0040283203125, |
| "learning_rate": 7.043478260869565e-07, |
| "loss": 0.0002, |
| "reward": 1.8791146278381348, |
| "reward_std": 0.38598939776420593, |
| "rewards/correctness_reward_vllm": 0.8832812309265137, |
| "rewards/length_reward": 0.9958333373069763, |
| "step": 68 |
| }, |
| { |
| "completion_length": 219.765625, |
| "epoch": 0.6, |
| "grad_norm": 5.806125164031982, |
| "kl": 0.0033721923828125, |
| "learning_rate": 7e-07, |
| "loss": 0.0001, |
| "reward": 1.8771874904632568, |
| "reward_std": 0.38062548637390137, |
| "rewards/correctness_reward_vllm": 0.8964582681655884, |
| "rewards/length_reward": 0.9807292222976685, |
| "step": 69 |
| }, |
| { |
| "completion_length": 218.140625, |
| "epoch": 0.6086956521739131, |
| "grad_norm": 4.798632621765137, |
| "kl": 0.0035247802734375, |
| "learning_rate": 6.956521739130434e-07, |
| "loss": 0.0001, |
| "reward": 1.8987500667572021, |
| "reward_std": 0.3669140338897705, |
| "rewards/correctness_reward_vllm": 0.9081250429153442, |
| "rewards/length_reward": 0.9906250238418579, |
| "step": 70 |
| }, |
| { |
| "completion_length": 215.59896850585938, |
| "epoch": 0.6173913043478261, |
| "grad_norm": 2.9346864223480225, |
| "kl": 0.0033111572265625, |
| "learning_rate": 6.913043478260869e-07, |
| "loss": 0.0001, |
| "reward": 1.8859374523162842, |
| "reward_std": 0.3378005623817444, |
| "rewards/correctness_reward_vllm": 0.9036458730697632, |
| "rewards/length_reward": 0.9822916984558105, |
| "step": 71 |
| }, |
| { |
| "completion_length": 215.5729217529297, |
| "epoch": 0.6260869565217392, |
| "grad_norm": 14.938803672790527, |
| "kl": 0.003631591796875, |
| "learning_rate": 6.869565217391305e-07, |
| "loss": 0.0001, |
| "reward": 1.9791147708892822, |
| "reward_std": 0.3305058479309082, |
| "rewards/correctness_reward_vllm": 0.98848956823349, |
| "rewards/length_reward": 0.9906250238418579, |
| "step": 72 |
| }, |
| { |
| "completion_length": 222.48959350585938, |
| "epoch": 0.6347826086956522, |
| "grad_norm": 3.483288288116455, |
| "kl": 0.0037841796875, |
| "learning_rate": 6.826086956521738e-07, |
| "loss": 0.0002, |
| "reward": 1.9103645086288452, |
| "reward_std": 0.3817846179008484, |
| "rewards/correctness_reward_vllm": 0.9249479174613953, |
| "rewards/length_reward": 0.9854166507720947, |
| "step": 73 |
| }, |
| { |
| "completion_length": 217.9947967529297, |
| "epoch": 0.6434782608695652, |
| "grad_norm": 4.190740585327148, |
| "kl": 0.0034027099609375, |
| "learning_rate": 6.782608695652173e-07, |
| "loss": 0.0001, |
| "reward": 2.024218797683716, |
| "reward_std": 0.3620142936706543, |
| "rewards/correctness_reward_vllm": 1.0294270515441895, |
| "rewards/length_reward": 0.9947916865348816, |
| "step": 74 |
| }, |
| { |
| "completion_length": 216.03646850585938, |
| "epoch": 0.6521739130434783, |
| "grad_norm": 3.1468451023101807, |
| "kl": 0.0037994384765625, |
| "learning_rate": 6.739130434782609e-07, |
| "loss": 0.0002, |
| "reward": 1.9498958587646484, |
| "reward_std": 0.343797892332077, |
| "rewards/correctness_reward_vllm": 0.9566665887832642, |
| "rewards/length_reward": 0.9932291507720947, |
| "step": 75 |
| }, |
| { |
| "completion_length": 205.484375, |
| "epoch": 0.6608695652173913, |
| "grad_norm": 7.804642200469971, |
| "kl": 0.003814697265625, |
| "learning_rate": 6.695652173913044e-07, |
| "loss": 0.0002, |
| "reward": 1.9136979579925537, |
| "reward_std": 0.33026009798049927, |
| "rewards/correctness_reward_vllm": 0.9178646802902222, |
| "rewards/length_reward": 0.9958333373069763, |
| "step": 76 |
| }, |
| { |
| "completion_length": 209.2604217529297, |
| "epoch": 0.6695652173913044, |
| "grad_norm": 2.271778106689453, |
| "kl": 0.00347900390625, |
| "learning_rate": 6.652173913043478e-07, |
| "loss": 0.0001, |
| "reward": 2.0001044273376465, |
| "reward_std": 0.3248264789581299, |
| "rewards/correctness_reward_vllm": 1.0084375143051147, |
| "rewards/length_reward": 0.9916666746139526, |
| "step": 77 |
| }, |
| { |
| "completion_length": 215.70834350585938, |
| "epoch": 0.6782608695652174, |
| "grad_norm": 3.02213191986084, |
| "kl": 0.00341796875, |
| "learning_rate": 6.608695652173912e-07, |
| "loss": 0.0001, |
| "reward": 1.9772396087646484, |
| "reward_std": 0.3613937795162201, |
| "rewards/correctness_reward_vllm": 0.977239727973938, |
| "rewards/length_reward": 1.0, |
| "step": 78 |
| }, |
| { |
| "completion_length": 212.40625, |
| "epoch": 0.6869565217391305, |
| "grad_norm": 6.0936055183410645, |
| "kl": 0.003753662109375, |
| "learning_rate": 6.565217391304348e-07, |
| "loss": 0.0001, |
| "reward": 1.9798438549041748, |
| "reward_std": 0.3753187656402588, |
| "rewards/correctness_reward_vllm": 0.9892187118530273, |
| "rewards/length_reward": 0.9906250238418579, |
| "step": 79 |
| }, |
| { |
| "completion_length": 221.2291717529297, |
| "epoch": 0.6956521739130435, |
| "grad_norm": 17.235055923461914, |
| "kl": 0.0036163330078125, |
| "learning_rate": 6.521739130434782e-07, |
| "loss": 0.0001, |
| "reward": 1.9942710399627686, |
| "reward_std": 0.3695635199546814, |
| "rewards/correctness_reward_vllm": 1.0078125, |
| "rewards/length_reward": 0.9864583015441895, |
| "step": 80 |
| }, |
| { |
| "completion_length": 221.046875, |
| "epoch": 0.7043478260869566, |
| "grad_norm": 2.6756632328033447, |
| "kl": 0.003875732421875, |
| "learning_rate": 6.478260869565217e-07, |
| "loss": 0.0002, |
| "reward": 2.012760639190674, |
| "reward_std": 0.3683191239833832, |
| "rewards/correctness_reward_vllm": 1.015364646911621, |
| "rewards/length_reward": 0.9973958730697632, |
| "step": 81 |
| }, |
| { |
| "completion_length": 209.78646850585938, |
| "epoch": 0.7130434782608696, |
| "grad_norm": 3.258723020553589, |
| "kl": 0.003692626953125, |
| "learning_rate": 6.434782608695652e-07, |
| "loss": 0.0001, |
| "reward": 1.91057288646698, |
| "reward_std": 0.3535638451576233, |
| "rewards/correctness_reward_vllm": 0.91734379529953, |
| "rewards/length_reward": 0.9932291507720947, |
| "step": 82 |
| }, |
| { |
| "completion_length": 222.11459350585938, |
| "epoch": 0.7217391304347827, |
| "grad_norm": 3.031001329421997, |
| "kl": 0.00408935546875, |
| "learning_rate": 6.391304347826086e-07, |
| "loss": 0.0002, |
| "reward": 1.874010443687439, |
| "reward_std": 0.34764736890792847, |
| "rewards/correctness_reward_vllm": 0.8859896063804626, |
| "rewards/length_reward": 0.9880208373069763, |
| "step": 83 |
| }, |
| { |
| "completion_length": 214.171875, |
| "epoch": 0.7304347826086957, |
| "grad_norm": 3.0759449005126953, |
| "kl": 0.004058837890625, |
| "learning_rate": 6.347826086956521e-07, |
| "loss": 0.0002, |
| "reward": 1.9028127193450928, |
| "reward_std": 0.34005099534988403, |
| "rewards/correctness_reward_vllm": 0.909583330154419, |
| "rewards/length_reward": 0.9932291507720947, |
| "step": 84 |
| }, |
| { |
| "completion_length": 202.20834350585938, |
| "epoch": 0.7391304347826086, |
| "grad_norm": 2.5407440662384033, |
| "kl": 0.00421142578125, |
| "learning_rate": 6.304347826086957e-07, |
| "loss": 0.0002, |
| "reward": 1.8680729866027832, |
| "reward_std": 0.3732559084892273, |
| "rewards/correctness_reward_vllm": 0.8831771612167358, |
| "rewards/length_reward": 0.9848958849906921, |
| "step": 85 |
| }, |
| { |
| "completion_length": 219.9635467529297, |
| "epoch": 0.7478260869565218, |
| "grad_norm": 3.382174015045166, |
| "kl": 0.00457763671875, |
| "learning_rate": 6.260869565217392e-07, |
| "loss": 0.0002, |
| "reward": 1.9778646230697632, |
| "reward_std": 0.3570324778556824, |
| "rewards/correctness_reward_vllm": 0.9830728769302368, |
| "rewards/length_reward": 0.9947916865348816, |
| "step": 86 |
| }, |
| { |
| "completion_length": 208.31771850585938, |
| "epoch": 0.7565217391304347, |
| "grad_norm": 4.980509281158447, |
| "kl": 0.0042724609375, |
| "learning_rate": 6.217391304347825e-07, |
| "loss": 0.0002, |
| "reward": 1.9402084350585938, |
| "reward_std": 0.3678310811519623, |
| "rewards/correctness_reward_vllm": 0.9527082443237305, |
| "rewards/length_reward": 0.9874999523162842, |
| "step": 87 |
| }, |
| { |
| "completion_length": 218.546875, |
| "epoch": 0.7652173913043478, |
| "grad_norm": 6.500480651855469, |
| "kl": 0.004241943359375, |
| "learning_rate": 6.17391304347826e-07, |
| "loss": 0.0002, |
| "reward": 1.88979172706604, |
| "reward_std": 0.36772656440734863, |
| "rewards/correctness_reward_vllm": 0.9017707705497742, |
| "rewards/length_reward": 0.9880208373069763, |
| "step": 88 |
| }, |
| { |
| "completion_length": 215.0416717529297, |
| "epoch": 0.7739130434782608, |
| "grad_norm": 2.6303582191467285, |
| "kl": 0.004150390625, |
| "learning_rate": 6.130434782608696e-07, |
| "loss": 0.0002, |
| "reward": 1.9179166555404663, |
| "reward_std": 0.39331650733947754, |
| "rewards/correctness_reward_vllm": 0.9246875643730164, |
| "rewards/length_reward": 0.9932291507720947, |
| "step": 89 |
| }, |
| { |
| "completion_length": 210.0729217529297, |
| "epoch": 0.782608695652174, |
| "grad_norm": 3.609468460083008, |
| "kl": 0.00439453125, |
| "learning_rate": 6.08695652173913e-07, |
| "loss": 0.0002, |
| "reward": 1.836041808128357, |
| "reward_std": 0.32552263140678406, |
| "rewards/correctness_reward_vllm": 0.838645875453949, |
| "rewards/length_reward": 0.9973958730697632, |
| "step": 90 |
| }, |
| { |
| "completion_length": 209.734375, |
| "epoch": 0.7913043478260869, |
| "grad_norm": 6.067314624786377, |
| "kl": 0.004547119140625, |
| "learning_rate": 6.043478260869564e-07, |
| "loss": 0.0002, |
| "reward": 1.9498958587646484, |
| "reward_std": 0.34004995226860046, |
| "rewards/correctness_reward_vllm": 0.9540625810623169, |
| "rewards/length_reward": 0.9958333373069763, |
| "step": 91 |
| }, |
| { |
| "completion_length": 210.1510467529297, |
| "epoch": 0.8, |
| "grad_norm": 1.9421336650848389, |
| "kl": 0.00457763671875, |
| "learning_rate": 6e-07, |
| "loss": 0.0002, |
| "reward": 1.9082813262939453, |
| "reward_std": 0.3142109513282776, |
| "rewards/correctness_reward_vllm": 0.92598956823349, |
| "rewards/length_reward": 0.9822916388511658, |
| "step": 92 |
| }, |
| { |
| "completion_length": 213.8229217529297, |
| "epoch": 0.808695652173913, |
| "grad_norm": 5.914642810821533, |
| "kl": 0.004241943359375, |
| "learning_rate": 5.956521739130435e-07, |
| "loss": 0.0002, |
| "reward": 1.8783854246139526, |
| "reward_std": 0.3885143995285034, |
| "rewards/correctness_reward_vllm": 0.88776034116745, |
| "rewards/length_reward": 0.9906250238418579, |
| "step": 93 |
| }, |
| { |
| "completion_length": 215.73959350585938, |
| "epoch": 0.8173913043478261, |
| "grad_norm": 4.0509867668151855, |
| "kl": 0.004119873046875, |
| "learning_rate": 5.913043478260869e-07, |
| "loss": 0.0002, |
| "reward": 1.8941667079925537, |
| "reward_std": 0.33366817235946655, |
| "rewards/correctness_reward_vllm": 0.8941666483879089, |
| "rewards/length_reward": 1.0, |
| "step": 94 |
| }, |
| { |
| "completion_length": 212.50521850585938, |
| "epoch": 0.8260869565217391, |
| "grad_norm": 4.238688945770264, |
| "kl": 0.004608154296875, |
| "learning_rate": 5.869565217391305e-07, |
| "loss": 0.0002, |
| "reward": 1.847812533378601, |
| "reward_std": 0.3324012756347656, |
| "rewards/correctness_reward_vllm": 0.869687557220459, |
| "rewards/length_reward": 0.9781250357627869, |
| "step": 95 |
| }, |
| { |
| "completion_length": 215.03125, |
| "epoch": 0.8347826086956521, |
| "grad_norm": 2.5241854190826416, |
| "kl": 0.004180908203125, |
| "learning_rate": 5.826086956521739e-07, |
| "loss": 0.0002, |
| "reward": 1.8808854818344116, |
| "reward_std": 0.3407609462738037, |
| "rewards/correctness_reward_vllm": 0.8834896087646484, |
| "rewards/length_reward": 0.9973958730697632, |
| "step": 96 |
| }, |
| { |
| "completion_length": 212.80209350585938, |
| "epoch": 0.8434782608695652, |
| "grad_norm": 2.265584707260132, |
| "kl": 0.00408935546875, |
| "learning_rate": 5.782608695652173e-07, |
| "loss": 0.0002, |
| "reward": 1.93052077293396, |
| "reward_std": 0.36047881841659546, |
| "rewards/correctness_reward_vllm": 0.9372917413711548, |
| "rewards/length_reward": 0.9932291507720947, |
| "step": 97 |
| }, |
| { |
| "completion_length": 213.67709350585938, |
| "epoch": 0.8521739130434782, |
| "grad_norm": 4.020750522613525, |
| "kl": 0.004547119140625, |
| "learning_rate": 5.739130434782609e-07, |
| "loss": 0.0002, |
| "reward": 1.8923959732055664, |
| "reward_std": 0.3393206000328064, |
| "rewards/correctness_reward_vllm": 0.8950001001358032, |
| "rewards/length_reward": 0.9973958730697632, |
| "step": 98 |
| }, |
| { |
| "completion_length": 207.59896850585938, |
| "epoch": 0.8608695652173913, |
| "grad_norm": 14.120591163635254, |
| "kl": 0.004302978515625, |
| "learning_rate": 5.695652173913044e-07, |
| "loss": 0.0002, |
| "reward": 2.019479274749756, |
| "reward_std": 0.34421300888061523, |
| "rewards/correctness_reward_vllm": 1.0236458778381348, |
| "rewards/length_reward": 0.9958333373069763, |
| "step": 99 |
| }, |
| { |
| "completion_length": 210.15625, |
| "epoch": 0.8695652173913043, |
| "grad_norm": 2.4444007873535156, |
| "kl": 0.004730224609375, |
| "learning_rate": 5.652173913043477e-07, |
| "loss": 0.0002, |
| "reward": 1.8682812452316284, |
| "reward_std": 0.3285192549228668, |
| "rewards/correctness_reward_vllm": 0.8734895586967468, |
| "rewards/length_reward": 0.9947916865348816, |
| "step": 100 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 230, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|