{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8695652173913043, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 208.5885467529297, "epoch": 0.008695652173913044, "grad_norm": 2.329385757446289, "kl": 0.0, "learning_rate": 9.956521739130434e-07, "loss": 0.0, "reward": 1.7678645849227905, "reward_std": 0.3930448889732361, "rewards/correctness_reward_vllm": 0.7756770849227905, "rewards/length_reward": 0.9921875, "step": 1 }, { "completion_length": 199.15625, "epoch": 0.017391304347826087, "grad_norm": 3.4958019256591797, "kl": 0.0003414154052734375, "learning_rate": 9.91304347826087e-07, "loss": 0.0, "reward": 1.792708396911621, "reward_std": 0.37898510694503784, "rewards/correctness_reward_vllm": 0.7994792461395264, "rewards/length_reward": 0.9932291507720947, "step": 2 }, { "completion_length": 201.4166717529297, "epoch": 0.02608695652173913, "grad_norm": 3.472921848297119, "kl": 0.000431060791015625, "learning_rate": 9.869565217391304e-07, "loss": 0.0, "reward": 1.732135534286499, "reward_std": 0.41073521971702576, "rewards/correctness_reward_vllm": 0.7576562762260437, "rewards/length_reward": 0.9744791984558105, "step": 3 }, { "completion_length": 203.53646850585938, "epoch": 0.034782608695652174, "grad_norm": 3.1970794200897217, "kl": 0.0004558563232421875, "learning_rate": 9.826086956521739e-07, "loss": 0.0, "reward": 1.8510417938232422, "reward_std": 0.3472898006439209, "rewards/correctness_reward_vllm": 0.8645833730697632, "rewards/length_reward": 0.9864583015441895, "step": 4 }, { "completion_length": 198.92709350585938, "epoch": 0.043478260869565216, "grad_norm": 3.3482213020324707, "kl": 0.000408172607421875, "learning_rate": 9.782608695652173e-07, "loss": 0.0, "reward": 1.7761459350585938, "reward_std": 0.3146688938140869, "rewards/correctness_reward_vllm": 0.7995834350585938, "rewards/length_reward": 0.9765625, "step": 5 }, { "completion_length": 207.5, "epoch": 0.05217391304347826, "grad_norm": 4.089375972747803, "kl": 0.00046539306640625, "learning_rate": 9.73913043478261e-07, "loss": 0.0, "reward": 1.8823437690734863, "reward_std": 0.3251632750034332, "rewards/correctness_reward_vllm": 0.8891145586967468, "rewards/length_reward": 0.9932291507720947, "step": 6 }, { "completion_length": 203.44271850585938, "epoch": 0.06086956521739131, "grad_norm": 3.0994174480438232, "kl": 0.00042724609375, "learning_rate": 9.695652173913042e-07, "loss": 0.0, "reward": 1.8064062595367432, "reward_std": 0.3677879273891449, "rewards/correctness_reward_vllm": 0.8230729103088379, "rewards/length_reward": 0.9833333492279053, "step": 7 }, { "completion_length": 198.77084350585938, "epoch": 0.06956521739130435, "grad_norm": 2.4492053985595703, "kl": 0.00041961669921875, "learning_rate": 9.652173913043478e-07, "loss": 0.0, "reward": 1.8719271421432495, "reward_std": 0.3530654311180115, "rewards/correctness_reward_vllm": 0.8927604556083679, "rewards/length_reward": 0.9791666865348816, "step": 8 }, { "completion_length": 205.6510467529297, "epoch": 0.0782608695652174, "grad_norm": 2.360391855239868, "kl": 0.00041961669921875, "learning_rate": 9.608695652173912e-07, "loss": 0.0, "reward": 1.8904168605804443, "reward_std": 0.36921584606170654, "rewards/correctness_reward_vllm": 0.8904166221618652, "rewards/length_reward": 1.0, "step": 9 }, { "completion_length": 205.4791717529297, "epoch": 0.08695652173913043, "grad_norm": 3.7472403049468994, "kl": 0.000499725341796875, "learning_rate": 9.565217391304349e-07, "loss": 0.0, "reward": 1.8460416793823242, "reward_std": 0.3746611475944519, "rewards/correctness_reward_vllm": 0.8543750643730164, "rewards/length_reward": 0.9916666746139526, "step": 10 }, { "completion_length": 199.20834350585938, "epoch": 0.09565217391304348, "grad_norm": 3.843543767929077, "kl": 0.00051116943359375, "learning_rate": 9.521739130434783e-07, "loss": 0.0, "reward": 1.7471874952316284, "reward_std": 0.33215177059173584, "rewards/correctness_reward_vllm": 0.7471875548362732, "rewards/length_reward": 1.0, "step": 11 }, { "completion_length": 212.421875, "epoch": 0.10434782608695652, "grad_norm": 4.518510341644287, "kl": 0.00049591064453125, "learning_rate": 9.478260869565216e-07, "loss": 0.0, "reward": 1.9361979961395264, "reward_std": 0.37871116399765015, "rewards/correctness_reward_vllm": 0.9361978769302368, "rewards/length_reward": 1.0, "step": 12 }, { "completion_length": 196.06771850585938, "epoch": 0.11304347826086956, "grad_norm": 2.444908380508423, "kl": 0.000583648681640625, "learning_rate": 9.434782608695652e-07, "loss": 0.0, "reward": 1.8197916746139526, "reward_std": 0.3508337140083313, "rewards/correctness_reward_vllm": 0.8348957896232605, "rewards/length_reward": 0.9848958849906921, "step": 13 }, { "completion_length": 198.2916717529297, "epoch": 0.12173913043478261, "grad_norm": 3.8315980434417725, "kl": 0.000629425048828125, "learning_rate": 9.391304347826087e-07, "loss": 0.0, "reward": 1.7896875143051147, "reward_std": 0.3679579496383667, "rewards/correctness_reward_vllm": 0.8021876215934753, "rewards/length_reward": 0.9874999523162842, "step": 14 }, { "completion_length": 219.5572967529297, "epoch": 0.13043478260869565, "grad_norm": 3.2742035388946533, "kl": 0.000690460205078125, "learning_rate": 9.347826086956522e-07, "loss": 0.0, "reward": 1.8117709159851074, "reward_std": 0.3478308320045471, "rewards/correctness_reward_vllm": 0.8185417056083679, "rewards/length_reward": 0.9932291507720947, "step": 15 }, { "completion_length": 200.30209350585938, "epoch": 0.1391304347826087, "grad_norm": 3.956608772277832, "kl": 0.000659942626953125, "learning_rate": 9.304347826086955e-07, "loss": 0.0, "reward": 1.8940625190734863, "reward_std": 0.387278288602829, "rewards/correctness_reward_vllm": 0.9216668009757996, "rewards/length_reward": 0.9723958373069763, "step": 16 }, { "completion_length": 202.625, "epoch": 0.14782608695652175, "grad_norm": 4.350553512573242, "kl": 0.000652313232421875, "learning_rate": 9.260869565217391e-07, "loss": 0.0, "reward": 1.854322910308838, "reward_std": 0.3680115342140198, "rewards/correctness_reward_vllm": 0.8694270849227905, "rewards/length_reward": 0.9848958253860474, "step": 17 }, { "completion_length": 204.4479217529297, "epoch": 0.1565217391304348, "grad_norm": 2.888611078262329, "kl": 0.000804901123046875, "learning_rate": 9.217391304347826e-07, "loss": 0.0, "reward": 1.8798437118530273, "reward_std": 0.3778643012046814, "rewards/correctness_reward_vllm": 0.8881769180297852, "rewards/length_reward": 0.9916666746139526, "step": 18 }, { "completion_length": 202.02084350585938, "epoch": 0.16521739130434782, "grad_norm": 4.008359909057617, "kl": 0.000759124755859375, "learning_rate": 9.17391304347826e-07, "loss": 0.0, "reward": 1.7861980199813843, "reward_std": 0.3587394952774048, "rewards/correctness_reward_vllm": 0.7945312857627869, "rewards/length_reward": 0.9916666746139526, "step": 19 }, { "completion_length": 212.41146850585938, "epoch": 0.17391304347826086, "grad_norm": 3.486192226409912, "kl": 0.00079345703125, "learning_rate": 9.130434782608695e-07, "loss": 0.0, "reward": 1.8601564168930054, "reward_std": 0.38853877782821655, "rewards/correctness_reward_vllm": 0.8736979961395264, "rewards/length_reward": 0.9864583015441895, "step": 20 }, { "completion_length": 205.70834350585938, "epoch": 0.1826086956521739, "grad_norm": 3.196575403213501, "kl": 0.00087738037109375, "learning_rate": 9.08695652173913e-07, "loss": 0.0, "reward": 1.8731770515441895, "reward_std": 0.3991861343383789, "rewards/correctness_reward_vllm": 0.8773437738418579, "rewards/length_reward": 0.9958333373069763, "step": 21 }, { "completion_length": 205.36459350585938, "epoch": 0.19130434782608696, "grad_norm": 4.234310626983643, "kl": 0.0009002685546875, "learning_rate": 9.043478260869564e-07, "loss": 0.0, "reward": 1.701145887374878, "reward_std": 0.3736024796962738, "rewards/correctness_reward_vllm": 0.7079167366027832, "rewards/length_reward": 0.9932291507720947, "step": 22 }, { "completion_length": 220.7447967529297, "epoch": 0.2, "grad_norm": 6.220200061798096, "kl": 0.00084686279296875, "learning_rate": 9e-07, "loss": 0.0, "reward": 1.84416663646698, "reward_std": 0.3591594696044922, "rewards/correctness_reward_vllm": 0.8493751287460327, "rewards/length_reward": 0.9947916865348816, "step": 23 }, { "completion_length": 200.5885467529297, "epoch": 0.20869565217391303, "grad_norm": 2.584165573120117, "kl": 0.00090789794921875, "learning_rate": 8.956521739130435e-07, "loss": 0.0, "reward": 1.8740625381469727, "reward_std": 0.3568645417690277, "rewards/correctness_reward_vllm": 0.8948957324028015, "rewards/length_reward": 0.9791666865348816, "step": 24 }, { "completion_length": 210.89584350585938, "epoch": 0.21739130434782608, "grad_norm": 3.030820608139038, "kl": 0.00096893310546875, "learning_rate": 8.913043478260869e-07, "loss": 0.0, "reward": 1.8118228912353516, "reward_std": 0.3701898455619812, "rewards/correctness_reward_vllm": 0.8264062404632568, "rewards/length_reward": 0.9854166507720947, "step": 25 }, { "completion_length": 208.3854217529297, "epoch": 0.22608695652173913, "grad_norm": 3.262943983078003, "kl": 0.00106048583984375, "learning_rate": 8.869565217391303e-07, "loss": 0.0, "reward": 1.9187500476837158, "reward_std": 0.34354889392852783, "rewards/correctness_reward_vllm": 0.9229166507720947, "rewards/length_reward": 0.9958333373069763, "step": 26 }, { "completion_length": 193.640625, "epoch": 0.23478260869565218, "grad_norm": 3.750316619873047, "kl": 0.00118255615234375, "learning_rate": 8.826086956521739e-07, "loss": 0.0, "reward": 1.8126041889190674, "reward_std": 0.3690093457698822, "rewards/correctness_reward_vllm": 0.8292707800865173, "rewards/length_reward": 0.9833333492279053, "step": 27 }, { "completion_length": 210.25521850585938, "epoch": 0.24347826086956523, "grad_norm": 2.774629592895508, "kl": 0.00115966796875, "learning_rate": 8.782608695652174e-07, "loss": 0.0, "reward": 1.8429688215255737, "reward_std": 0.3622826039791107, "rewards/correctness_reward_vllm": 0.8507812023162842, "rewards/length_reward": 0.9921875, "step": 28 }, { "completion_length": 220.1354217529297, "epoch": 0.25217391304347825, "grad_norm": 3.555556297302246, "kl": 0.0013275146484375, "learning_rate": 8.739130434782607e-07, "loss": 0.0001, "reward": 1.793125033378601, "reward_std": 0.3758922517299652, "rewards/correctness_reward_vllm": 0.8051042556762695, "rewards/length_reward": 0.9880208373069763, "step": 29 }, { "completion_length": 213.359375, "epoch": 0.2608695652173913, "grad_norm": 10.03189754486084, "kl": 0.0013580322265625, "learning_rate": 8.695652173913043e-07, "loss": 0.0001, "reward": 1.8984894752502441, "reward_std": 0.37575191259384155, "rewards/correctness_reward_vllm": 0.9010938405990601, "rewards/length_reward": 0.9973958730697632, "step": 30 }, { "completion_length": 206.078125, "epoch": 0.26956521739130435, "grad_norm": 3.930840253829956, "kl": 0.00125885009765625, "learning_rate": 8.652173913043478e-07, "loss": 0.0001, "reward": 1.9381771087646484, "reward_std": 0.346474826335907, "rewards/correctness_reward_vllm": 0.9381771087646484, "rewards/length_reward": 1.0, "step": 31 }, { "completion_length": 205.73959350585938, "epoch": 0.2782608695652174, "grad_norm": 2.839338779449463, "kl": 0.00125885009765625, "learning_rate": 8.608695652173913e-07, "loss": 0.0001, "reward": 1.9146875143051147, "reward_std": 0.3288370370864868, "rewards/correctness_reward_vllm": 0.9256248474121094, "rewards/length_reward": 0.989062488079071, "step": 32 }, { "completion_length": 218.8697967529297, "epoch": 0.28695652173913044, "grad_norm": 2.71004056930542, "kl": 0.0011749267578125, "learning_rate": 8.565217391304348e-07, "loss": 0.0, "reward": 1.9505729675292969, "reward_std": 0.36752164363861084, "rewards/correctness_reward_vllm": 0.9641145467758179, "rewards/length_reward": 0.9864583015441895, "step": 33 }, { "completion_length": 223.515625, "epoch": 0.2956521739130435, "grad_norm": 3.4277288913726807, "kl": 0.001434326171875, "learning_rate": 8.521739130434782e-07, "loss": 0.0001, "reward": 1.887239694595337, "reward_std": 0.3722625970840454, "rewards/correctness_reward_vllm": 0.899218738079071, "rewards/length_reward": 0.9880208373069763, "step": 34 }, { "completion_length": 202.125, "epoch": 0.30434782608695654, "grad_norm": 4.426881790161133, "kl": 0.00148773193359375, "learning_rate": 8.478260869565217e-07, "loss": 0.0001, "reward": 1.7834895849227905, "reward_std": 0.3372436463832855, "rewards/correctness_reward_vllm": 0.7886979579925537, "rewards/length_reward": 0.9947916865348816, "step": 35 }, { "completion_length": 211.09375, "epoch": 0.3130434782608696, "grad_norm": 2.230402708053589, "kl": 0.0015106201171875, "learning_rate": 8.434782608695652e-07, "loss": 0.0001, "reward": 1.8795312643051147, "reward_std": 0.35857951641082764, "rewards/correctness_reward_vllm": 0.8930728435516357, "rewards/length_reward": 0.9864583015441895, "step": 36 }, { "completion_length": 208.34896850585938, "epoch": 0.3217391304347826, "grad_norm": 19.257694244384766, "kl": 0.0014190673828125, "learning_rate": 8.391304347826087e-07, "loss": 0.0001, "reward": 1.9402083158493042, "reward_std": 0.3405280113220215, "rewards/correctness_reward_vllm": 0.9402083158493042, "rewards/length_reward": 1.0, "step": 37 }, { "completion_length": 196.92709350585938, "epoch": 0.33043478260869563, "grad_norm": 3.1662447452545166, "kl": 0.001678466796875, "learning_rate": 8.347826086956521e-07, "loss": 0.0001, "reward": 1.8916146755218506, "reward_std": 0.35224398970603943, "rewards/correctness_reward_vllm": 0.8999478220939636, "rewards/length_reward": 0.9916666746139526, "step": 38 }, { "completion_length": 218.38021850585938, "epoch": 0.3391304347826087, "grad_norm": 2.1624081134796143, "kl": 0.0016021728515625, "learning_rate": 8.304347826086955e-07, "loss": 0.0001, "reward": 1.990781307220459, "reward_std": 0.34267401695251465, "rewards/correctness_reward_vllm": 1.0017187595367432, "rewards/length_reward": 0.989062488079071, "step": 39 }, { "completion_length": 202.94271850585938, "epoch": 0.34782608695652173, "grad_norm": 3.153355598449707, "kl": 0.0018310546875, "learning_rate": 8.260869565217391e-07, "loss": 0.0001, "reward": 1.8341145515441895, "reward_std": 0.3642037808895111, "rewards/correctness_reward_vllm": 0.8393229246139526, "rewards/length_reward": 0.9947916865348816, "step": 40 }, { "completion_length": 205.2760467529297, "epoch": 0.3565217391304348, "grad_norm": 3.370213747024536, "kl": 0.00193023681640625, "learning_rate": 8.217391304347826e-07, "loss": 0.0001, "reward": 1.8303645849227905, "reward_std": 0.3733619153499603, "rewards/correctness_reward_vllm": 0.8303646445274353, "rewards/length_reward": 1.0, "step": 41 }, { "completion_length": 206.6354217529297, "epoch": 0.3652173913043478, "grad_norm": 7.641461372375488, "kl": 0.00191497802734375, "learning_rate": 8.173913043478261e-07, "loss": 0.0001, "reward": 1.8568229675292969, "reward_std": 0.3796784281730652, "rewards/correctness_reward_vllm": 0.8677603602409363, "rewards/length_reward": 0.989062488079071, "step": 42 }, { "completion_length": 208.90625, "epoch": 0.3739130434782609, "grad_norm": 4.36552095413208, "kl": 0.00179290771484375, "learning_rate": 8.130434782608695e-07, "loss": 0.0001, "reward": 1.8441667556762695, "reward_std": 0.319492369890213, "rewards/correctness_reward_vllm": 0.84416663646698, "rewards/length_reward": 1.0, "step": 43 }, { "completion_length": 211.84896850585938, "epoch": 0.3826086956521739, "grad_norm": 3.3810644149780273, "kl": 0.001983642578125, "learning_rate": 8.08695652173913e-07, "loss": 0.0001, "reward": 1.8228124380111694, "reward_std": 0.39267587661743164, "rewards/correctness_reward_vllm": 0.8389584422111511, "rewards/length_reward": 0.9838541746139526, "step": 44 }, { "completion_length": 214.06771850585938, "epoch": 0.391304347826087, "grad_norm": 3.44265079498291, "kl": 0.0020751953125, "learning_rate": 8.043478260869565e-07, "loss": 0.0001, "reward": 1.7645833492279053, "reward_std": 0.40830251574516296, "rewards/correctness_reward_vllm": 0.7880208492279053, "rewards/length_reward": 0.9765625, "step": 45 }, { "completion_length": 213.77084350585938, "epoch": 0.4, "grad_norm": 3.346031427383423, "kl": 0.00194549560546875, "learning_rate": 8e-07, "loss": 0.0001, "reward": 1.8528125286102295, "reward_std": 0.39482760429382324, "rewards/correctness_reward_vllm": 0.8637499809265137, "rewards/length_reward": 0.989062488079071, "step": 46 }, { "completion_length": 209.72396850585938, "epoch": 0.40869565217391307, "grad_norm": 5.3741374015808105, "kl": 0.0023651123046875, "learning_rate": 7.956521739130434e-07, "loss": 0.0001, "reward": 1.8273438215255737, "reward_std": 0.3434736728668213, "rewards/correctness_reward_vllm": 0.8299479484558105, "rewards/length_reward": 0.9973958730697632, "step": 47 }, { "completion_length": 210.171875, "epoch": 0.41739130434782606, "grad_norm": 2.268425941467285, "kl": 0.0022125244140625, "learning_rate": 7.913043478260869e-07, "loss": 0.0001, "reward": 1.9866665601730347, "reward_std": 0.3515927195549011, "rewards/correctness_reward_vllm": 1.0028126239776611, "rewards/length_reward": 0.9838541746139526, "step": 48 }, { "completion_length": 214.421875, "epoch": 0.4260869565217391, "grad_norm": 3.0277607440948486, "kl": 0.0023193359375, "learning_rate": 7.869565217391305e-07, "loss": 0.0001, "reward": 1.8909896612167358, "reward_std": 0.3666800856590271, "rewards/correctness_reward_vllm": 0.8977603912353516, "rewards/length_reward": 0.9932291507720947, "step": 49 }, { "completion_length": 199.6979217529297, "epoch": 0.43478260869565216, "grad_norm": 3.3212649822235107, "kl": 0.002349853515625, "learning_rate": 7.826086956521739e-07, "loss": 0.0001, "reward": 1.8709373474121094, "reward_std": 0.381402850151062, "rewards/correctness_reward_vllm": 0.8792707920074463, "rewards/length_reward": 0.9916666746139526, "step": 50 }, { "completion_length": 210.41146850585938, "epoch": 0.4434782608695652, "grad_norm": 2.650980234146118, "kl": 0.002227783203125, "learning_rate": 7.782608695652173e-07, "loss": 0.0001, "reward": 1.9605729579925537, "reward_std": 0.3226872384548187, "rewards/correctness_reward_vllm": 0.9673436880111694, "rewards/length_reward": 0.9932291507720947, "step": 51 }, { "completion_length": 210.3072967529297, "epoch": 0.45217391304347826, "grad_norm": 2.7690789699554443, "kl": 0.002410888671875, "learning_rate": 7.739130434782608e-07, "loss": 0.0001, "reward": 1.937760591506958, "reward_std": 0.34728074073791504, "rewards/correctness_reward_vllm": 0.9513019919395447, "rewards/length_reward": 0.9864583015441895, "step": 52 }, { "completion_length": 202.38021850585938, "epoch": 0.4608695652173913, "grad_norm": 3.0100247859954834, "kl": 0.002288818359375, "learning_rate": 7.695652173913043e-07, "loss": 0.0001, "reward": 1.8619270324707031, "reward_std": 0.3619835376739502, "rewards/correctness_reward_vllm": 0.8619270324707031, "rewards/length_reward": 1.0, "step": 53 }, { "completion_length": 207.6666717529297, "epoch": 0.46956521739130436, "grad_norm": 4.956729888916016, "kl": 0.0023956298828125, "learning_rate": 7.652173913043478e-07, "loss": 0.0001, "reward": 1.8556770086288452, "reward_std": 0.3410601019859314, "rewards/correctness_reward_vllm": 0.8666146993637085, "rewards/length_reward": 0.989062488079071, "step": 54 }, { "completion_length": 210.65625, "epoch": 0.4782608695652174, "grad_norm": 6.263027191162109, "kl": 0.0027008056640625, "learning_rate": 7.608695652173913e-07, "loss": 0.0001, "reward": 1.796875, "reward_std": 0.3679881989955902, "rewards/correctness_reward_vllm": 0.8020833730697632, "rewards/length_reward": 0.9947916865348816, "step": 55 }, { "completion_length": 209.0416717529297, "epoch": 0.48695652173913045, "grad_norm": 3.7194344997406006, "kl": 0.002960205078125, "learning_rate": 7.565217391304347e-07, "loss": 0.0001, "reward": 1.902083396911621, "reward_std": 0.3579518496990204, "rewards/correctness_reward_vllm": 0.9020832777023315, "rewards/length_reward": 1.0, "step": 56 }, { "completion_length": 213.625, "epoch": 0.4956521739130435, "grad_norm": 7.1098246574401855, "kl": 0.002716064453125, "learning_rate": 7.521739130434782e-07, "loss": 0.0001, "reward": 1.8305728435516357, "reward_std": 0.3378227949142456, "rewards/correctness_reward_vllm": 0.8482812643051147, "rewards/length_reward": 0.9822916984558105, "step": 57 }, { "completion_length": 210.109375, "epoch": 0.5043478260869565, "grad_norm": 2.48675799369812, "kl": 0.0030517578125, "learning_rate": 7.478260869565217e-07, "loss": 0.0001, "reward": 1.8633334636688232, "reward_std": 0.3492564558982849, "rewards/correctness_reward_vllm": 0.8659374713897705, "rewards/length_reward": 0.9973958730697632, "step": 58 }, { "completion_length": 218.73959350585938, "epoch": 0.5130434782608696, "grad_norm": 3.753242254257202, "kl": 0.0026702880859375, "learning_rate": 7.434782608695653e-07, "loss": 0.0001, "reward": 1.9696353673934937, "reward_std": 0.33653122186660767, "rewards/correctness_reward_vllm": 0.9883853793144226, "rewards/length_reward": 0.981249988079071, "step": 59 }, { "completion_length": 221.8072967529297, "epoch": 0.5217391304347826, "grad_norm": 2.6025142669677734, "kl": 0.0027618408203125, "learning_rate": 7.391304347826086e-07, "loss": 0.0001, "reward": 1.8679168224334717, "reward_std": 0.3256112039089203, "rewards/correctness_reward_vllm": 0.8788542151451111, "rewards/length_reward": 0.989062488079071, "step": 60 }, { "completion_length": 210.16146850585938, "epoch": 0.5304347826086957, "grad_norm": 3.2735888957977295, "kl": 0.002960205078125, "learning_rate": 7.347826086956521e-07, "loss": 0.0001, "reward": 1.9982812404632568, "reward_std": 0.36957651376724243, "rewards/correctness_reward_vllm": 1.005052089691162, "rewards/length_reward": 0.9932291507720947, "step": 61 }, { "completion_length": 217.3229217529297, "epoch": 0.5391304347826087, "grad_norm": 5.706833362579346, "kl": 0.0030364990234375, "learning_rate": 7.304347826086957e-07, "loss": 0.0001, "reward": 1.949479103088379, "reward_std": 0.3874410390853882, "rewards/correctness_reward_vllm": 0.9619792699813843, "rewards/length_reward": 0.9875000715255737, "step": 62 }, { "completion_length": 230.703125, "epoch": 0.5478260869565217, "grad_norm": 2.584719657897949, "kl": 0.0027923583984375, "learning_rate": 7.260869565217391e-07, "loss": 0.0001, "reward": 1.8884896039962769, "reward_std": 0.3631989657878876, "rewards/correctness_reward_vllm": 0.905677080154419, "rewards/length_reward": 0.9828125238418579, "step": 63 }, { "completion_length": 206.375, "epoch": 0.5565217391304348, "grad_norm": 4.763845443725586, "kl": 0.0034332275390625, "learning_rate": 7.217391304347826e-07, "loss": 0.0001, "reward": 1.813020944595337, "reward_std": 0.36838170886039734, "rewards/correctness_reward_vllm": 0.8276041746139526, "rewards/length_reward": 0.9854166507720947, "step": 64 }, { "completion_length": 213.7291717529297, "epoch": 0.5652173913043478, "grad_norm": 2.8753910064697266, "kl": 0.003021240234375, "learning_rate": 7.17391304347826e-07, "loss": 0.0001, "reward": 1.838854193687439, "reward_std": 0.3463793992996216, "rewards/correctness_reward_vllm": 0.8456250429153442, "rewards/length_reward": 0.9932291507720947, "step": 65 }, { "completion_length": 221.125, "epoch": 0.5739130434782609, "grad_norm": 5.015041828155518, "kl": 0.0030670166015625, "learning_rate": 7.130434782608695e-07, "loss": 0.0001, "reward": 1.9818229675292969, "reward_std": 0.3662077784538269, "rewards/correctness_reward_vllm": 0.9844271540641785, "rewards/length_reward": 0.9973958730697632, "step": 66 }, { "completion_length": 214.05209350585938, "epoch": 0.5826086956521739, "grad_norm": 4.653159141540527, "kl": 0.0031585693359375, "learning_rate": 7.08695652173913e-07, "loss": 0.0001, "reward": 1.8708856105804443, "reward_std": 0.3716661334037781, "rewards/correctness_reward_vllm": 0.8859895467758179, "rewards/length_reward": 0.9848958849906921, "step": 67 }, { "completion_length": 202.1197967529297, "epoch": 0.591304347826087, "grad_norm": 19.68362808227539, "kl": 0.0040283203125, "learning_rate": 7.043478260869565e-07, "loss": 0.0002, "reward": 1.8791146278381348, "reward_std": 0.38598939776420593, "rewards/correctness_reward_vllm": 0.8832812309265137, "rewards/length_reward": 0.9958333373069763, "step": 68 }, { "completion_length": 219.765625, "epoch": 0.6, "grad_norm": 5.806125164031982, "kl": 0.0033721923828125, "learning_rate": 7e-07, "loss": 0.0001, "reward": 1.8771874904632568, "reward_std": 0.38062548637390137, "rewards/correctness_reward_vllm": 0.8964582681655884, "rewards/length_reward": 0.9807292222976685, "step": 69 }, { "completion_length": 218.140625, "epoch": 0.6086956521739131, "grad_norm": 4.798632621765137, "kl": 0.0035247802734375, "learning_rate": 6.956521739130434e-07, "loss": 0.0001, "reward": 1.8987500667572021, "reward_std": 0.3669140338897705, "rewards/correctness_reward_vllm": 0.9081250429153442, "rewards/length_reward": 0.9906250238418579, "step": 70 }, { "completion_length": 215.59896850585938, "epoch": 0.6173913043478261, "grad_norm": 2.9346864223480225, "kl": 0.0033111572265625, "learning_rate": 6.913043478260869e-07, "loss": 0.0001, "reward": 1.8859374523162842, "reward_std": 0.3378005623817444, "rewards/correctness_reward_vllm": 0.9036458730697632, "rewards/length_reward": 0.9822916984558105, "step": 71 }, { "completion_length": 215.5729217529297, "epoch": 0.6260869565217392, "grad_norm": 14.938803672790527, "kl": 0.003631591796875, "learning_rate": 6.869565217391305e-07, "loss": 0.0001, "reward": 1.9791147708892822, "reward_std": 0.3305058479309082, "rewards/correctness_reward_vllm": 0.98848956823349, "rewards/length_reward": 0.9906250238418579, "step": 72 }, { "completion_length": 222.48959350585938, "epoch": 0.6347826086956522, "grad_norm": 3.483288288116455, "kl": 0.0037841796875, "learning_rate": 6.826086956521738e-07, "loss": 0.0002, "reward": 1.9103645086288452, "reward_std": 0.3817846179008484, "rewards/correctness_reward_vllm": 0.9249479174613953, "rewards/length_reward": 0.9854166507720947, "step": 73 }, { "completion_length": 217.9947967529297, "epoch": 0.6434782608695652, "grad_norm": 4.190740585327148, "kl": 0.0034027099609375, "learning_rate": 6.782608695652173e-07, "loss": 0.0001, "reward": 2.024218797683716, "reward_std": 0.3620142936706543, "rewards/correctness_reward_vllm": 1.0294270515441895, "rewards/length_reward": 0.9947916865348816, "step": 74 }, { "completion_length": 216.03646850585938, "epoch": 0.6521739130434783, "grad_norm": 3.1468451023101807, "kl": 0.0037994384765625, "learning_rate": 6.739130434782609e-07, "loss": 0.0002, "reward": 1.9498958587646484, "reward_std": 0.343797892332077, "rewards/correctness_reward_vllm": 0.9566665887832642, "rewards/length_reward": 0.9932291507720947, "step": 75 }, { "completion_length": 205.484375, "epoch": 0.6608695652173913, "grad_norm": 7.804642200469971, "kl": 0.003814697265625, "learning_rate": 6.695652173913044e-07, "loss": 0.0002, "reward": 1.9136979579925537, "reward_std": 0.33026009798049927, "rewards/correctness_reward_vllm": 0.9178646802902222, "rewards/length_reward": 0.9958333373069763, "step": 76 }, { "completion_length": 209.2604217529297, "epoch": 0.6695652173913044, "grad_norm": 2.271778106689453, "kl": 0.00347900390625, "learning_rate": 6.652173913043478e-07, "loss": 0.0001, "reward": 2.0001044273376465, "reward_std": 0.3248264789581299, "rewards/correctness_reward_vllm": 1.0084375143051147, "rewards/length_reward": 0.9916666746139526, "step": 77 }, { "completion_length": 215.70834350585938, "epoch": 0.6782608695652174, "grad_norm": 3.02213191986084, "kl": 0.00341796875, "learning_rate": 6.608695652173912e-07, "loss": 0.0001, "reward": 1.9772396087646484, "reward_std": 0.3613937795162201, "rewards/correctness_reward_vllm": 0.977239727973938, "rewards/length_reward": 1.0, "step": 78 }, { "completion_length": 212.40625, "epoch": 0.6869565217391305, "grad_norm": 6.0936055183410645, "kl": 0.003753662109375, "learning_rate": 6.565217391304348e-07, "loss": 0.0001, "reward": 1.9798438549041748, "reward_std": 0.3753187656402588, "rewards/correctness_reward_vllm": 0.9892187118530273, "rewards/length_reward": 0.9906250238418579, "step": 79 }, { "completion_length": 221.2291717529297, "epoch": 0.6956521739130435, "grad_norm": 17.235055923461914, "kl": 0.0036163330078125, "learning_rate": 6.521739130434782e-07, "loss": 0.0001, "reward": 1.9942710399627686, "reward_std": 0.3695635199546814, "rewards/correctness_reward_vllm": 1.0078125, "rewards/length_reward": 0.9864583015441895, "step": 80 }, { "completion_length": 221.046875, "epoch": 0.7043478260869566, "grad_norm": 2.6756632328033447, "kl": 0.003875732421875, "learning_rate": 6.478260869565217e-07, "loss": 0.0002, "reward": 2.012760639190674, "reward_std": 0.3683191239833832, "rewards/correctness_reward_vllm": 1.015364646911621, "rewards/length_reward": 0.9973958730697632, "step": 81 }, { "completion_length": 209.78646850585938, "epoch": 0.7130434782608696, "grad_norm": 3.258723020553589, "kl": 0.003692626953125, "learning_rate": 6.434782608695652e-07, "loss": 0.0001, "reward": 1.91057288646698, "reward_std": 0.3535638451576233, "rewards/correctness_reward_vllm": 0.91734379529953, "rewards/length_reward": 0.9932291507720947, "step": 82 }, { "completion_length": 222.11459350585938, "epoch": 0.7217391304347827, "grad_norm": 3.031001329421997, "kl": 0.00408935546875, "learning_rate": 6.391304347826086e-07, "loss": 0.0002, "reward": 1.874010443687439, "reward_std": 0.34764736890792847, "rewards/correctness_reward_vllm": 0.8859896063804626, "rewards/length_reward": 0.9880208373069763, "step": 83 }, { "completion_length": 214.171875, "epoch": 0.7304347826086957, "grad_norm": 3.0759449005126953, "kl": 0.004058837890625, "learning_rate": 6.347826086956521e-07, "loss": 0.0002, "reward": 1.9028127193450928, "reward_std": 0.34005099534988403, "rewards/correctness_reward_vllm": 0.909583330154419, "rewards/length_reward": 0.9932291507720947, "step": 84 }, { "completion_length": 202.20834350585938, "epoch": 0.7391304347826086, "grad_norm": 2.5407440662384033, "kl": 0.00421142578125, "learning_rate": 6.304347826086957e-07, "loss": 0.0002, "reward": 1.8680729866027832, "reward_std": 0.3732559084892273, "rewards/correctness_reward_vllm": 0.8831771612167358, "rewards/length_reward": 0.9848958849906921, "step": 85 }, { "completion_length": 219.9635467529297, "epoch": 0.7478260869565218, "grad_norm": 3.382174015045166, "kl": 0.00457763671875, "learning_rate": 6.260869565217392e-07, "loss": 0.0002, "reward": 1.9778646230697632, "reward_std": 0.3570324778556824, "rewards/correctness_reward_vllm": 0.9830728769302368, "rewards/length_reward": 0.9947916865348816, "step": 86 }, { "completion_length": 208.31771850585938, "epoch": 0.7565217391304347, "grad_norm": 4.980509281158447, "kl": 0.0042724609375, "learning_rate": 6.217391304347825e-07, "loss": 0.0002, "reward": 1.9402084350585938, "reward_std": 0.3678310811519623, "rewards/correctness_reward_vllm": 0.9527082443237305, "rewards/length_reward": 0.9874999523162842, "step": 87 }, { "completion_length": 218.546875, "epoch": 0.7652173913043478, "grad_norm": 6.500480651855469, "kl": 0.004241943359375, "learning_rate": 6.17391304347826e-07, "loss": 0.0002, "reward": 1.88979172706604, "reward_std": 0.36772656440734863, "rewards/correctness_reward_vllm": 0.9017707705497742, "rewards/length_reward": 0.9880208373069763, "step": 88 }, { "completion_length": 215.0416717529297, "epoch": 0.7739130434782608, "grad_norm": 2.6303582191467285, "kl": 0.004150390625, "learning_rate": 6.130434782608696e-07, "loss": 0.0002, "reward": 1.9179166555404663, "reward_std": 0.39331650733947754, "rewards/correctness_reward_vllm": 0.9246875643730164, "rewards/length_reward": 0.9932291507720947, "step": 89 }, { "completion_length": 210.0729217529297, "epoch": 0.782608695652174, "grad_norm": 3.609468460083008, "kl": 0.00439453125, "learning_rate": 6.08695652173913e-07, "loss": 0.0002, "reward": 1.836041808128357, "reward_std": 0.32552263140678406, "rewards/correctness_reward_vllm": 0.838645875453949, "rewards/length_reward": 0.9973958730697632, "step": 90 }, { "completion_length": 209.734375, "epoch": 0.7913043478260869, "grad_norm": 6.067314624786377, "kl": 0.004547119140625, "learning_rate": 6.043478260869564e-07, "loss": 0.0002, "reward": 1.9498958587646484, "reward_std": 0.34004995226860046, "rewards/correctness_reward_vllm": 0.9540625810623169, "rewards/length_reward": 0.9958333373069763, "step": 91 }, { "completion_length": 210.1510467529297, "epoch": 0.8, "grad_norm": 1.9421336650848389, "kl": 0.00457763671875, "learning_rate": 6e-07, "loss": 0.0002, "reward": 1.9082813262939453, "reward_std": 0.3142109513282776, "rewards/correctness_reward_vllm": 0.92598956823349, "rewards/length_reward": 0.9822916388511658, "step": 92 }, { "completion_length": 213.8229217529297, "epoch": 0.808695652173913, "grad_norm": 5.914642810821533, "kl": 0.004241943359375, "learning_rate": 5.956521739130435e-07, "loss": 0.0002, "reward": 1.8783854246139526, "reward_std": 0.3885143995285034, "rewards/correctness_reward_vllm": 0.88776034116745, "rewards/length_reward": 0.9906250238418579, "step": 93 }, { "completion_length": 215.73959350585938, "epoch": 0.8173913043478261, "grad_norm": 4.0509867668151855, "kl": 0.004119873046875, "learning_rate": 5.913043478260869e-07, "loss": 0.0002, "reward": 1.8941667079925537, "reward_std": 0.33366817235946655, "rewards/correctness_reward_vllm": 0.8941666483879089, "rewards/length_reward": 1.0, "step": 94 }, { "completion_length": 212.50521850585938, "epoch": 0.8260869565217391, "grad_norm": 4.238688945770264, "kl": 0.004608154296875, "learning_rate": 5.869565217391305e-07, "loss": 0.0002, "reward": 1.847812533378601, "reward_std": 0.3324012756347656, "rewards/correctness_reward_vllm": 0.869687557220459, "rewards/length_reward": 0.9781250357627869, "step": 95 }, { "completion_length": 215.03125, "epoch": 0.8347826086956521, "grad_norm": 2.5241854190826416, "kl": 0.004180908203125, "learning_rate": 5.826086956521739e-07, "loss": 0.0002, "reward": 1.8808854818344116, "reward_std": 0.3407609462738037, "rewards/correctness_reward_vllm": 0.8834896087646484, "rewards/length_reward": 0.9973958730697632, "step": 96 }, { "completion_length": 212.80209350585938, "epoch": 0.8434782608695652, "grad_norm": 2.265584707260132, "kl": 0.00408935546875, "learning_rate": 5.782608695652173e-07, "loss": 0.0002, "reward": 1.93052077293396, "reward_std": 0.36047881841659546, "rewards/correctness_reward_vllm": 0.9372917413711548, "rewards/length_reward": 0.9932291507720947, "step": 97 }, { "completion_length": 213.67709350585938, "epoch": 0.8521739130434782, "grad_norm": 4.020750522613525, "kl": 0.004547119140625, "learning_rate": 5.739130434782609e-07, "loss": 0.0002, "reward": 1.8923959732055664, "reward_std": 0.3393206000328064, "rewards/correctness_reward_vllm": 0.8950001001358032, "rewards/length_reward": 0.9973958730697632, "step": 98 }, { "completion_length": 207.59896850585938, "epoch": 0.8608695652173913, "grad_norm": 14.120591163635254, "kl": 0.004302978515625, "learning_rate": 5.695652173913044e-07, "loss": 0.0002, "reward": 2.019479274749756, "reward_std": 0.34421300888061523, "rewards/correctness_reward_vllm": 1.0236458778381348, "rewards/length_reward": 0.9958333373069763, "step": 99 }, { "completion_length": 210.15625, "epoch": 0.8695652173913043, "grad_norm": 2.4444007873535156, "kl": 0.004730224609375, "learning_rate": 5.652173913043477e-07, "loss": 0.0002, "reward": 1.8682812452316284, "reward_std": 0.3285192549228668, "rewards/correctness_reward_vllm": 0.8734895586967468, "rewards/length_reward": 0.9947916865348816, "step": 100 } ], "logging_steps": 1.0, "max_steps": 230, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }