{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 10, "global_step": 375, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 617.9583435058594, "epoch": 0.0026666666666666666, "grad_norm": 0.5426230430603027, "kl": 0.0, "learning_rate": 4e-08, "loss": -0.2056, "reward": 0.27083334140479565, "reward_std": 0.2350771315395832, "rewards/accuracy_reward": 0.27083334140479565, "rewards/format_reward": 0.0, "step": 1 }, { "completion_length": 669.7916870117188, "epoch": 0.005333333333333333, "grad_norm": 0.6748480796813965, "kl": 0.0, "learning_rate": 8e-08, "loss": -0.0475, "reward": 0.2083333395421505, "reward_std": 0.3881702348589897, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 2 }, { "completion_length": 896.7292022705078, "epoch": 0.008, "grad_norm": 0.4940797984600067, "kl": 0.0002243518829345703, "learning_rate": 1.2000000000000002e-07, "loss": -0.1296, "reward": 0.27083333395421505, "reward_std": 0.3842546306550503, "rewards/accuracy_reward": 0.27083333395421505, "rewards/format_reward": 0.0, "step": 3 }, { "completion_length": 823.6458587646484, "epoch": 0.010666666666666666, "grad_norm": 0.26322299242019653, "kl": 0.00017309188842773438, "learning_rate": 1.6e-07, "loss": -0.038, "reward": 0.1666666679084301, "reward_std": 0.23899272084236145, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 4 }, { "completion_length": 828.6875152587891, "epoch": 0.013333333333333334, "grad_norm": 0.15690098702907562, "kl": 0.0001386404037475586, "learning_rate": 2e-07, "loss": -0.1455, "reward": 0.10416666977107525, "reward_std": 0.1801304928958416, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 5 }, { "completion_length": 649.8333358764648, "epoch": 0.016, "grad_norm": 0.25603243708610535, "kl": 0.00013273954391479492, "learning_rate": 2.4000000000000003e-07, "loss": -0.0906, "reward": 0.2291666716337204, "reward_std": 0.24468021839857101, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 6 }, { "completion_length": 842.6458587646484, "epoch": 0.018666666666666668, "grad_norm": 0.26339226961135864, "kl": 0.00011420249938964844, "learning_rate": 2.8e-07, "loss": -0.018, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 7 }, { "completion_length": 517.8750228881836, "epoch": 0.021333333333333333, "grad_norm": 0.21927940845489502, "kl": 0.0001316070556640625, "learning_rate": 3.2e-07, "loss": 0.0214, "reward": 0.2083333358168602, "reward_std": 0.3061862252652645, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 8 }, { "completion_length": 605.8541717529297, "epoch": 0.024, "grad_norm": 0.29087916016578674, "kl": 0.00013589859008789062, "learning_rate": 3.6e-07, "loss": -0.0332, "reward": 0.20833333767950535, "reward_std": 0.38817023858428, "rewards/accuracy_reward": 0.20833333767950535, "rewards/format_reward": 0.0, "step": 9 }, { "completion_length": 787.3333435058594, "epoch": 0.02666666666666667, "grad_norm": 0.12677079439163208, "kl": 0.0001424551010131836, "learning_rate": 4e-07, "loss": -0.0164, "reward": 0.1458333395421505, "reward_std": 0.1530931033194065, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 10 }, { "completion_length": 715.2916870117188, "epoch": 0.029333333333333333, "grad_norm": 0.196958988904953, "kl": 0.00014650821685791016, "learning_rate": 4.4e-07, "loss": 0.0302, "reward": 0.12500000558793545, "reward_std": 0.18404607474803925, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 11 }, { "completion_length": 685.3125305175781, "epoch": 0.032, "grad_norm": 0.2892319858074188, "kl": 0.0001569986343383789, "learning_rate": 4.800000000000001e-07, "loss": -0.0853, "reward": 0.2083333395421505, "reward_std": 0.3881702348589897, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 12 }, { "completion_length": 687.5000305175781, "epoch": 0.034666666666666665, "grad_norm": 0.20977553725242615, "kl": 0.0001035928726196289, "learning_rate": 5.2e-07, "loss": -0.0293, "reward": 0.1250000037252903, "reward_std": 0.16661180183291435, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 13 }, { "completion_length": 721.3125152587891, "epoch": 0.037333333333333336, "grad_norm": 0.43697115778923035, "kl": 0.00019097328186035156, "learning_rate": 5.6e-07, "loss": -0.0344, "reward": 0.31250001303851604, "reward_std": 0.36417657509446144, "rewards/accuracy_reward": 0.31250001303851604, "rewards/format_reward": 0.0, "step": 14 }, { "completion_length": 815.6250152587891, "epoch": 0.04, "grad_norm": 0.30842292308807373, "kl": 0.00019025802612304688, "learning_rate": 6.000000000000001e-07, "loss": 0.0081, "reward": 0.25000000931322575, "reward_std": 0.3332235962152481, "rewards/accuracy_reward": 0.25000000931322575, "rewards/format_reward": 0.0, "step": 15 }, { "completion_length": 714.8750152587891, "epoch": 0.042666666666666665, "grad_norm": 0.15875642001628876, "kl": 0.00021457672119140625, "learning_rate": 6.4e-07, "loss": -0.069, "reward": 0.2083333358168602, "reward_std": 0.16661180183291435, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 16 }, { "completion_length": 713.6458587646484, "epoch": 0.04533333333333334, "grad_norm": 0.19242540001869202, "kl": 0.00024271011352539062, "learning_rate": 6.8e-07, "loss": 0.0165, "reward": 0.22916666977107525, "reward_std": 0.35457348451018333, "rewards/accuracy_reward": 0.22916666977107525, "rewards/format_reward": 0.0, "step": 17 }, { "completion_length": 591.5208435058594, "epoch": 0.048, "grad_norm": 0.2873741686344147, "kl": 0.00020933151245117188, "learning_rate": 7.2e-07, "loss": -0.0329, "reward": 0.22916667349636555, "reward_std": 0.40168894082307816, "rewards/accuracy_reward": 0.22916667349636555, "rewards/format_reward": 0.0, "step": 18 }, { "completion_length": 622.2083587646484, "epoch": 0.050666666666666665, "grad_norm": 0.20909227430820465, "kl": 0.00016427040100097656, "learning_rate": 7.600000000000001e-07, "loss": 0.1103, "reward": 0.27083333767950535, "reward_std": 0.2446802221238613, "rewards/accuracy_reward": 0.27083333767950535, "rewards/format_reward": 0.0, "step": 19 }, { "completion_length": 561.9166793823242, "epoch": 0.05333333333333334, "grad_norm": 1.7326514720916748, "kl": 0.003941774368286133, "learning_rate": 8e-07, "loss": -0.0138, "reward": 0.1458333358168602, "reward_std": 0.2350771278142929, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 919.6250152587891, "epoch": 0.056, "grad_norm": 0.15656666457653046, "kl": 0.0001838207244873047, "learning_rate": 8.400000000000001e-07, "loss": -0.02, "reward": 0.25000001303851604, "reward_std": 0.3332235999405384, "rewards/accuracy_reward": 0.25000001303851604, "rewards/format_reward": 0.0, "step": 21 }, { "completion_length": 662.7500267028809, "epoch": 0.058666666666666666, "grad_norm": 0.6273130178451538, "kl": 0.0004911422729492188, "learning_rate": 8.8e-07, "loss": 0.0228, "reward": 0.2291666753590107, "reward_std": 0.31970490887761116, "rewards/accuracy_reward": 0.2291666753590107, "rewards/format_reward": 0.0, "step": 22 }, { "completion_length": 724.3958511352539, "epoch": 0.06133333333333333, "grad_norm": 0.3087979257106781, "kl": 0.00020515918731689453, "learning_rate": 9.2e-07, "loss": 0.0192, "reward": 0.1458333395421505, "reward_std": 0.23507710918784142, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 23 }, { "completion_length": 584.0416870117188, "epoch": 0.064, "grad_norm": 0.4410843253135681, "kl": 0.0004787445068359375, "learning_rate": 9.600000000000001e-07, "loss": -0.0191, "reward": 0.2916666716337204, "reward_std": 0.4701542556285858, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 24 }, { "completion_length": 542.3125076293945, "epoch": 0.06666666666666667, "grad_norm": 0.6044087409973145, "kl": 0.0002796649932861328, "learning_rate": 1e-06, "loss": -0.0329, "reward": 0.27083334140479565, "reward_std": 0.3720077611505985, "rewards/accuracy_reward": 0.27083334140479565, "rewards/format_reward": 0.0, "step": 25 }, { "completion_length": 769.7083587646484, "epoch": 0.06933333333333333, "grad_norm": 0.25316932797431946, "kl": 0.00023126602172851562, "learning_rate": 1.04e-06, "loss": 0.0189, "reward": 0.1666666679084301, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 26 }, { "completion_length": 767.7916870117188, "epoch": 0.072, "grad_norm": 0.1700117290019989, "kl": 0.0002732276916503906, "learning_rate": 1.08e-06, "loss": -0.0018, "reward": 0.1458333395421505, "reward_std": 0.1530931033194065, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 27 }, { "completion_length": 730.8333435058594, "epoch": 0.07466666666666667, "grad_norm": 0.3627185821533203, "kl": 0.0003504753112792969, "learning_rate": 1.12e-06, "loss": 0.0534, "reward": 0.0833333358168602, "reward_std": 0.20412414893507957, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 28 }, { "completion_length": 638.6666793823242, "epoch": 0.07733333333333334, "grad_norm": 0.23711198568344116, "kl": 0.0003572702407836914, "learning_rate": 1.16e-06, "loss": 0.0077, "reward": 0.12500000186264515, "reward_std": 0.22155842557549477, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.0, "step": 29 }, { "completion_length": 576.9375228881836, "epoch": 0.08, "grad_norm": 0.1860937625169754, "kl": 0.0005288124084472656, "learning_rate": 1.2000000000000002e-06, "loss": -0.0037, "reward": 0.12500000186264515, "reward_std": 0.18404608964920044, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 769.3958511352539, "epoch": 0.08266666666666667, "grad_norm": 0.19105114042758942, "kl": 0.0011196136474609375, "learning_rate": 1.24e-06, "loss": 0.1389, "reward": 0.4375000074505806, "reward_std": 0.43655750155448914, "rewards/accuracy_reward": 0.4375000074505806, "rewards/format_reward": 0.0, "step": 31 }, { "completion_length": 853.6042022705078, "epoch": 0.08533333333333333, "grad_norm": 0.22852593660354614, "kl": 0.0007238388061523438, "learning_rate": 1.28e-06, "loss": 0.0189, "reward": 0.1875000037252903, "reward_std": 0.33713920414447784, "rewards/accuracy_reward": 0.1875000037252903, "rewards/format_reward": 0.0, "step": 32 }, { "completion_length": 778.4791870117188, "epoch": 0.088, "grad_norm": 0.23918747901916504, "kl": 0.0009222030639648438, "learning_rate": 1.32e-06, "loss": -0.0058, "reward": 0.3958333469927311, "reward_std": 0.3816108703613281, "rewards/accuracy_reward": 0.3958333469927311, "rewards/format_reward": 0.0, "step": 33 }, { "completion_length": 704.6041793823242, "epoch": 0.09066666666666667, "grad_norm": 0.4135127365589142, "kl": 0.001522064208984375, "learning_rate": 1.36e-06, "loss": 0.1236, "reward": 0.1666666679084301, "reward_std": 0.3332235887646675, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 34 }, { "completion_length": 599.6458511352539, "epoch": 0.09333333333333334, "grad_norm": 0.3384934365749359, "kl": 0.0020999908447265625, "learning_rate": 1.4000000000000001e-06, "loss": -0.1167, "reward": 0.1666666716337204, "reward_std": 0.2861081697046757, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 35 }, { "completion_length": 932.8750305175781, "epoch": 0.096, "grad_norm": 0.32682985067367554, "kl": 0.0016765594482421875, "learning_rate": 1.44e-06, "loss": 0.0254, "reward": 0.3750000111758709, "reward_std": 0.4326418787240982, "rewards/accuracy_reward": 0.3750000111758709, "rewards/format_reward": 0.0, "step": 36 }, { "completion_length": 539.9583587646484, "epoch": 0.09866666666666667, "grad_norm": 1.6405223608016968, "kl": 0.003711700439453125, "learning_rate": 1.48e-06, "loss": -0.0445, "reward": 0.3541666716337204, "reward_std": 0.3816108778119087, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 37 }, { "completion_length": 570.0625228881836, "epoch": 0.10133333333333333, "grad_norm": 0.28974005579948425, "kl": 0.004131317138671875, "learning_rate": 1.5200000000000003e-06, "loss": -0.0492, "reward": 0.2500000037252903, "reward_std": 0.2957112640142441, "rewards/accuracy_reward": 0.2500000037252903, "rewards/format_reward": 0.0, "step": 38 }, { "completion_length": 653.6666946411133, "epoch": 0.104, "grad_norm": 0.15624871850013733, "kl": 0.00244903564453125, "learning_rate": 1.56e-06, "loss": -0.0274, "reward": 0.1666666716337204, "reward_std": 0.20148035883903503, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 39 }, { "completion_length": 980.7708587646484, "epoch": 0.10666666666666667, "grad_norm": 0.23032425343990326, "kl": 0.002117156982421875, "learning_rate": 1.6e-06, "loss": 0.105, "reward": 0.3958333544433117, "reward_std": 0.42872628569602966, "rewards/accuracy_reward": 0.3958333544433117, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 779.8750305175781, "epoch": 0.10933333333333334, "grad_norm": 0.19505877792835236, "kl": 0.005481719970703125, "learning_rate": 1.64e-06, "loss": 0.0268, "reward": 0.31250000558793545, "reward_std": 0.3720077611505985, "rewards/accuracy_reward": 0.31250000558793545, "rewards/format_reward": 0.0, "step": 41 }, { "completion_length": 731.0416870117188, "epoch": 0.112, "grad_norm": 0.15505914390087128, "kl": 0.004322052001953125, "learning_rate": 1.6800000000000002e-06, "loss": -0.014, "reward": 0.5000000074505806, "reward_std": 0.18404608964920044, "rewards/accuracy_reward": 0.5000000074505806, "rewards/format_reward": 0.0, "step": 42 }, { "completion_length": 688.4583587646484, "epoch": 0.11466666666666667, "grad_norm": 0.4499289393424988, "kl": 0.00275421142578125, "learning_rate": 1.72e-06, "loss": 0.0817, "reward": 0.5416666865348816, "reward_std": 0.23116152361035347, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "step": 43 }, { "completion_length": 882.5000305175781, "epoch": 0.11733333333333333, "grad_norm": 0.12532763183116913, "kl": 0.003265380859375, "learning_rate": 1.76e-06, "loss": 0.1742, "reward": 0.3750000149011612, "reward_std": 0.3776952587068081, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "step": 44 }, { "completion_length": 962.8125305175781, "epoch": 0.12, "grad_norm": 0.11625898629426956, "kl": 0.001895904541015625, "learning_rate": 1.8e-06, "loss": 0.1109, "reward": 0.2708333395421505, "reward_std": 0.2996268458664417, "rewards/accuracy_reward": 0.2708333395421505, "rewards/format_reward": 0.0, "step": 45 }, { "completion_length": 841.7500152587891, "epoch": 0.12266666666666666, "grad_norm": 0.05917806923389435, "kl": 0.004627227783203125, "learning_rate": 1.84e-06, "loss": 0.0264, "reward": 0.1250000037252903, "reward_std": 0.12909945845603943, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 46 }, { "completion_length": 960.7500305175781, "epoch": 0.12533333333333332, "grad_norm": 0.12586216628551483, "kl": 0.0055255889892578125, "learning_rate": 1.8800000000000002e-06, "loss": 0.0527, "reward": 0.35416667722165585, "reward_std": 0.2996268607676029, "rewards/accuracy_reward": 0.35416667722165585, "rewards/format_reward": 0.0, "step": 47 }, { "completion_length": 838.5417022705078, "epoch": 0.128, "grad_norm": 0.10650404542684555, "kl": 0.00315093994140625, "learning_rate": 1.9200000000000003e-06, "loss": 0.1018, "reward": 0.3750000111758709, "reward_std": 0.24859581142663956, "rewards/accuracy_reward": 0.3750000111758709, "rewards/format_reward": 0.0, "step": 48 }, { "completion_length": 726.0416870117188, "epoch": 0.13066666666666665, "grad_norm": 0.11485293507575989, "kl": 0.008026123046875, "learning_rate": 1.96e-06, "loss": 0.0354, "reward": 0.47916667722165585, "reward_std": 0.38161085173487663, "rewards/accuracy_reward": 0.47916667722165585, "rewards/format_reward": 0.0, "step": 49 }, { "completion_length": 829.6250457763672, "epoch": 0.13333333333333333, "grad_norm": 0.1546422690153122, "kl": 0.0053253173828125, "learning_rate": 2e-06, "loss": 0.005, "reward": 0.6041666716337204, "reward_std": 0.37377967685461044, "rewards/accuracy_reward": 0.6041666716337204, "rewards/format_reward": 0.0, "step": 50 }, { "completion_length": 670.8333435058594, "epoch": 0.136, "grad_norm": 0.16615261137485504, "kl": 0.0036163330078125, "learning_rate": 2.0400000000000004e-06, "loss": -0.0384, "reward": 0.416666679084301, "reward_std": 0.3131455332040787, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "step": 51 }, { "completion_length": 822.1250152587891, "epoch": 0.13866666666666666, "grad_norm": 0.32312434911727905, "kl": 0.00707244873046875, "learning_rate": 2.08e-06, "loss": -0.0019, "reward": 0.2083333358168602, "reward_std": 0.30354245007038116, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 52 }, { "completion_length": 964.8541946411133, "epoch": 0.14133333333333334, "grad_norm": 0.08396324515342712, "kl": 0.003337860107421875, "learning_rate": 2.12e-06, "loss": 0.0572, "reward": 0.3958333395421505, "reward_std": 0.2446802258491516, "rewards/accuracy_reward": 0.3958333395421505, "rewards/format_reward": 0.0, "step": 53 }, { "completion_length": 1055.0417175292969, "epoch": 0.144, "grad_norm": 0.074210025370121, "kl": 0.0041351318359375, "learning_rate": 2.16e-06, "loss": 0.0675, "reward": 0.29166667349636555, "reward_std": 0.16661180183291435, "rewards/accuracy_reward": 0.29166667349636555, "rewards/format_reward": 0.0, "step": 54 }, { "completion_length": 551.2083435058594, "epoch": 0.14666666666666667, "grad_norm": 0.1495818793773651, "kl": 0.007686614990234375, "learning_rate": 2.1999999999999997e-06, "loss": 0.0052, "reward": 0.4375000149011612, "reward_std": 0.1530931144952774, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "step": 55 }, { "completion_length": 519.0833358764648, "epoch": 0.14933333333333335, "grad_norm": 0.15317903459072113, "kl": 0.0075836181640625, "learning_rate": 2.24e-06, "loss": 0.0578, "reward": 0.4375000186264515, "reward_std": 0.41912320256233215, "rewards/accuracy_reward": 0.4375000186264515, "rewards/format_reward": 0.0, "step": 56 }, { "completion_length": 972.2292175292969, "epoch": 0.152, "grad_norm": 0.11835772544145584, "kl": 0.003032684326171875, "learning_rate": 2.28e-06, "loss": 0.1167, "reward": 0.4375000149011612, "reward_std": 0.33713919669389725, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "step": 57 }, { "completion_length": 793.3958435058594, "epoch": 0.15466666666666667, "grad_norm": 0.12818466126918793, "kl": 0.002803802490234375, "learning_rate": 2.32e-06, "loss": -0.0492, "reward": 0.6250000149011612, "reward_std": 0.2686738632619381, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 58 }, { "completion_length": 668.3750152587891, "epoch": 0.15733333333333333, "grad_norm": 0.11275894194841385, "kl": 0.003559112548828125, "learning_rate": 2.36e-06, "loss": 0.0205, "reward": 0.5625000149011612, "reward_std": 0.34674228727817535, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 59 }, { "completion_length": 989.7292175292969, "epoch": 0.16, "grad_norm": 0.1831459105014801, "kl": 0.003498077392578125, "learning_rate": 2.4000000000000003e-06, "loss": 0.0121, "reward": 0.4583333432674408, "reward_std": 0.31314554065465927, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "step": 60 }, { "completion_length": 904.0833740234375, "epoch": 0.16266666666666665, "grad_norm": 0.08020555973052979, "kl": 0.00287628173828125, "learning_rate": 2.44e-06, "loss": 0.024, "reward": 0.5000000149011612, "reward_std": 0.30354243889451027, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "step": 61 }, { "completion_length": 806.1875305175781, "epoch": 0.16533333333333333, "grad_norm": 0.3277391791343689, "kl": 0.0061187744140625, "learning_rate": 2.48e-06, "loss": 0.0241, "reward": 0.5625000149011612, "reward_std": 0.33713919296860695, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 62 }, { "completion_length": 600.7500076293945, "epoch": 0.168, "grad_norm": 0.2129485160112381, "kl": 0.008419036865234375, "learning_rate": 2.52e-06, "loss": 0.115, "reward": 0.5625000223517418, "reward_std": 0.28219257295131683, "rewards/accuracy_reward": 0.5625000223517418, "rewards/format_reward": 0.0, "step": 63 }, { "completion_length": 771.1458740234375, "epoch": 0.17066666666666666, "grad_norm": 0.28386905789375305, "kl": 0.00501251220703125, "learning_rate": 2.56e-06, "loss": 0.0106, "reward": 0.6041666865348816, "reward_std": 0.42872628569602966, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 64 }, { "completion_length": 778.8750228881836, "epoch": 0.17333333333333334, "grad_norm": 0.10419953614473343, "kl": 0.009777069091796875, "learning_rate": 2.6e-06, "loss": 0.0624, "reward": 0.7916666865348816, "reward_std": 0.2861081622540951, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 65 }, { "completion_length": 861.1875457763672, "epoch": 0.176, "grad_norm": 0.2341865748167038, "kl": 0.00921630859375, "learning_rate": 2.64e-06, "loss": 0.0133, "reward": 0.31250000558793545, "reward_std": 0.2996268570423126, "rewards/accuracy_reward": 0.31250000558793545, "rewards/format_reward": 0.0, "step": 66 }, { "completion_length": 694.6875305175781, "epoch": 0.17866666666666667, "grad_norm": 0.11757036298513412, "kl": 0.013874053955078125, "learning_rate": 2.68e-06, "loss": 0.0182, "reward": 0.708333358168602, "reward_std": 0.2861081659793854, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 67 }, { "completion_length": 933.8750305175781, "epoch": 0.18133333333333335, "grad_norm": 0.13943161070346832, "kl": 0.012493133544921875, "learning_rate": 2.72e-06, "loss": 0.1693, "reward": 0.41666667722165585, "reward_std": 0.2861081510782242, "rewards/accuracy_reward": 0.41666667722165585, "rewards/format_reward": 0.0, "step": 68 }, { "completion_length": 890.3333587646484, "epoch": 0.184, "grad_norm": 0.09695959836244583, "kl": 0.00414276123046875, "learning_rate": 2.7600000000000003e-06, "loss": 0.0099, "reward": 0.27083334885537624, "reward_std": 0.23507710918784142, "rewards/accuracy_reward": 0.27083334885537624, "rewards/format_reward": 0.0, "step": 69 }, { "completion_length": 740.6250305175781, "epoch": 0.18666666666666668, "grad_norm": 0.13296610116958618, "kl": 0.00751495361328125, "learning_rate": 2.8000000000000003e-06, "loss": 0.0852, "reward": 0.541666679084301, "reward_std": 0.37592337280511856, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.0, "step": 70 }, { "completion_length": 655.1041870117188, "epoch": 0.18933333333333333, "grad_norm": 0.11237625777721405, "kl": 0.008686065673828125, "learning_rate": 2.84e-06, "loss": 0.0895, "reward": 0.6875000298023224, "reward_std": 0.28219255805015564, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 71 }, { "completion_length": 862.8333435058594, "epoch": 0.192, "grad_norm": 0.18952777981758118, "kl": 0.0099639892578125, "learning_rate": 2.88e-06, "loss": 0.1253, "reward": 0.5833333432674408, "reward_std": 0.4230388179421425, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 72 }, { "completion_length": 639.8333587646484, "epoch": 0.19466666666666665, "grad_norm": 0.09035161137580872, "kl": 0.00690460205078125, "learning_rate": 2.9200000000000004e-06, "loss": 0.0498, "reward": 0.4166666716337204, "reward_std": 0.25642700120806694, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "step": 73 }, { "completion_length": 793.6875305175781, "epoch": 0.19733333333333333, "grad_norm": 0.10165846347808838, "kl": 0.00469970703125, "learning_rate": 2.96e-06, "loss": 0.0626, "reward": 0.6250000111758709, "reward_std": 0.31314554065465927, "rewards/accuracy_reward": 0.6250000111758709, "rewards/format_reward": 0.0, "step": 74 }, { "completion_length": 896.1250152587891, "epoch": 0.2, "grad_norm": 0.12082868069410324, "kl": 0.004482269287109375, "learning_rate": 3e-06, "loss": -0.0443, "reward": 0.3958333358168602, "reward_std": 0.28219256550073624, "rewards/accuracy_reward": 0.3958333358168602, "rewards/format_reward": 0.0, "step": 75 }, { "completion_length": 525.1875152587891, "epoch": 0.20266666666666666, "grad_norm": 0.21093720197677612, "kl": 0.01087188720703125, "learning_rate": 2.9999837537669383e-06, "loss": 0.0263, "reward": 0.604166679084301, "reward_std": 0.2996268570423126, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.0, "step": 76 }, { "completion_length": 632.3541717529297, "epoch": 0.20533333333333334, "grad_norm": 0.09489479660987854, "kl": 0.00710296630859375, "learning_rate": 2.9999350154196726e-06, "loss": 0.0416, "reward": 0.6875000298023224, "reward_std": 0.21764283254742622, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 77 }, { "completion_length": 835.8125152587891, "epoch": 0.208, "grad_norm": 0.11504478007555008, "kl": 0.00841522216796875, "learning_rate": 2.9998537860139563e-06, "loss": 0.0233, "reward": 0.5416666865348816, "reward_std": 0.4152076058089733, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "step": 78 }, { "completion_length": 860.5000305175781, "epoch": 0.21066666666666667, "grad_norm": 0.07925013452768326, "kl": 0.00824737548828125, "learning_rate": 2.9997400673093517e-06, "loss": 0.0732, "reward": 0.5000000111758709, "reward_std": 0.26603008806705475, "rewards/accuracy_reward": 0.5000000111758709, "rewards/format_reward": 0.0, "step": 79 }, { "completion_length": 933.6458587646484, "epoch": 0.21333333333333335, "grad_norm": 0.1141030564904213, "kl": 0.009246826171875, "learning_rate": 2.9995938617691924e-06, "loss": -0.0376, "reward": 0.2916666716337204, "reward_std": 0.20148037374019623, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 80 }, { "completion_length": 606.5416870117188, "epoch": 0.216, "grad_norm": 0.09949828684329987, "kl": 0.0077667236328125, "learning_rate": 2.9994151725605313e-06, "loss": 0.1411, "reward": 0.5416666716337204, "reward_std": 0.25642701238393784, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.0, "step": 81 }, { "completion_length": 759.7500305175781, "epoch": 0.21866666666666668, "grad_norm": 0.11169271171092987, "kl": 0.00763702392578125, "learning_rate": 2.9992040035540708e-06, "loss": 0.0378, "reward": 0.6458333432674408, "reward_std": 0.35457348451018333, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 82 }, { "completion_length": 885.8958587646484, "epoch": 0.22133333333333333, "grad_norm": 0.09573396295309067, "kl": 0.005886077880859375, "learning_rate": 2.9989603593240777e-06, "loss": 0.1027, "reward": 0.4583333507180214, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.4583333507180214, "rewards/format_reward": 0.0, "step": 83 }, { "completion_length": 844.2291870117188, "epoch": 0.224, "grad_norm": 0.11840051412582397, "kl": 0.004993438720703125, "learning_rate": 2.9986842451482876e-06, "loss": 0.0166, "reward": 0.6458333507180214, "reward_std": 0.317061148583889, "rewards/accuracy_reward": 0.6458333507180214, "rewards/format_reward": 0.0, "step": 84 }, { "completion_length": 699.4166870117188, "epoch": 0.22666666666666666, "grad_norm": 0.575175940990448, "kl": 0.00882720947265625, "learning_rate": 2.998375667007787e-06, "loss": 0.1395, "reward": 0.5833333507180214, "reward_std": 0.24859580025076866, "rewards/accuracy_reward": 0.5833333507180214, "rewards/format_reward": 0.0, "step": 85 }, { "completion_length": 752.3333435058594, "epoch": 0.22933333333333333, "grad_norm": 0.08685300499200821, "kl": 0.00833892822265625, "learning_rate": 2.9980346315868857e-06, "loss": -0.0384, "reward": 0.3750000074505806, "reward_std": 0.18404608592391014, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.0, "step": 86 }, { "completion_length": 849.0000305175781, "epoch": 0.232, "grad_norm": 0.08769199252128601, "kl": 0.0060882568359375, "learning_rate": 2.9976611462729716e-06, "loss": -0.036, "reward": 0.3958333432674408, "reward_std": 0.309229951351881, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "step": 87 }, { "completion_length": 692.1666946411133, "epoch": 0.23466666666666666, "grad_norm": 0.2641507387161255, "kl": 0.0601959228515625, "learning_rate": 2.997255219156351e-06, "loss": -0.0153, "reward": 0.5000000149011612, "reward_std": 0.10206207260489464, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "step": 88 }, { "completion_length": 733.9791946411133, "epoch": 0.23733333333333334, "grad_norm": 0.11071512848138809, "kl": 0.0061798095703125, "learning_rate": 2.996816859030072e-06, "loss": 0.023, "reward": 0.33333333395421505, "reward_std": 0.22155842557549477, "rewards/accuracy_reward": 0.33333333395421505, "rewards/format_reward": 0.0, "step": 89 }, { "completion_length": 776.4583587646484, "epoch": 0.24, "grad_norm": 0.08652577549219131, "kl": 0.005710601806640625, "learning_rate": 2.9963460753897363e-06, "loss": 0.0425, "reward": 0.6041666865348816, "reward_std": 0.1530931070446968, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 90 }, { "completion_length": 766.8333435058594, "epoch": 0.24266666666666667, "grad_norm": 0.13474039733409882, "kl": 0.00852203369140625, "learning_rate": 2.9958428784332913e-06, "loss": 0.0211, "reward": 0.5833333432674408, "reward_std": 0.3332235962152481, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 91 }, { "completion_length": 817.7500152587891, "epoch": 0.24533333333333332, "grad_norm": 1.9350175857543945, "kl": 0.00949859619140625, "learning_rate": 2.995307279060811e-06, "loss": 0.105, "reward": 0.4375000074505806, "reward_std": 0.2996268458664417, "rewards/accuracy_reward": 0.4375000074505806, "rewards/format_reward": 0.0, "step": 92 }, { "completion_length": 599.6875152587891, "epoch": 0.248, "grad_norm": 0.1610657274723053, "kl": 0.013885498046875, "learning_rate": 2.9947392888742567e-06, "loss": 0.0217, "reward": 0.6041666716337204, "reward_std": 0.11558075994253159, "rewards/accuracy_reward": 0.6041666716337204, "rewards/format_reward": 0.0, "step": 93 }, { "completion_length": 626.25, "epoch": 0.25066666666666665, "grad_norm": 0.19253714382648468, "kl": 0.01007843017578125, "learning_rate": 2.994138920177231e-06, "loss": 0.0233, "reward": 0.583333358168602, "reward_std": 0.16661179810762405, "rewards/accuracy_reward": 0.583333358168602, "rewards/format_reward": 0.0, "step": 94 }, { "completion_length": 913.4167022705078, "epoch": 0.25333333333333335, "grad_norm": 0.2549319863319397, "kl": 0.010101318359375, "learning_rate": 2.9935061859747068e-06, "loss": 0.0697, "reward": 0.41666668467223644, "reward_std": 0.36809216812253, "rewards/accuracy_reward": 0.41666668467223644, "rewards/format_reward": 0.0, "step": 95 }, { "completion_length": 851.0416870117188, "epoch": 0.256, "grad_norm": 0.09697781503200531, "kl": 0.007923126220703125, "learning_rate": 2.9928410999727467e-06, "loss": 0.0469, "reward": 0.5416666716337204, "reward_std": 0.22155842557549477, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.0, "step": 96 }, { "completion_length": 1129.5000305175781, "epoch": 0.25866666666666666, "grad_norm": 0.1845501810312271, "kl": 0.00640869140625, "learning_rate": 2.9921436765782077e-06, "loss": 0.0713, "reward": 0.5208333432674408, "reward_std": 0.2996268533170223, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 97 }, { "completion_length": 841.6875305175781, "epoch": 0.2613333333333333, "grad_norm": 0.1793762743473053, "kl": 0.0106964111328125, "learning_rate": 2.9914139308984264e-06, "loss": 0.0075, "reward": 0.479166679084301, "reward_std": 0.28219256550073624, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "step": 98 }, { "completion_length": 668.7291717529297, "epoch": 0.264, "grad_norm": 0.2460280954837799, "kl": 0.0189361572265625, "learning_rate": 2.9906518787408948e-06, "loss": 0.0203, "reward": 0.3958333507180214, "reward_std": 0.33713920041918755, "rewards/accuracy_reward": 0.3958333507180214, "rewards/format_reward": 0.0, "step": 99 }, { "completion_length": 811.4166870117188, "epoch": 0.26666666666666666, "grad_norm": 0.16320976614952087, "kl": 0.0098114013671875, "learning_rate": 2.989857536612915e-06, "loss": 0.0632, "reward": 0.4375000149011612, "reward_std": 0.2900237590074539, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "step": 100 }, { "completion_length": 876.5000305175781, "epoch": 0.2693333333333333, "grad_norm": 2.9081761837005615, "kl": 0.05722808837890625, "learning_rate": 2.989030921721243e-06, "loss": 0.0033, "reward": 0.5, "reward_std": 0.3602609783411026, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "step": 101 }, { "completion_length": 856.0833435058594, "epoch": 0.272, "grad_norm": 0.12382116168737411, "kl": 0.0175018310546875, "learning_rate": 2.988172051971717e-06, "loss": 0.0418, "reward": 0.4166666865348816, "reward_std": 0.20412414520978928, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 0.0, "step": 102 }, { "completion_length": 866.6667022705078, "epoch": 0.27466666666666667, "grad_norm": 0.10861078649759293, "kl": 0.01019287109375, "learning_rate": 2.9872809459688676e-06, "loss": 0.0183, "reward": 0.520833358168602, "reward_std": 0.33713918179273605, "rewards/accuracy_reward": 0.520833358168602, "rewards/format_reward": 0.0, "step": 103 }, { "completion_length": 757.8333587646484, "epoch": 0.2773333333333333, "grad_norm": 0.13254211843013763, "kl": 0.02797698974609375, "learning_rate": 2.986357623015516e-06, "loss": 0.0117, "reward": 0.541666679084301, "reward_std": 0.19364918768405914, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.0, "step": 104 }, { "completion_length": 749.7500305175781, "epoch": 0.28, "grad_norm": 0.18112321197986603, "kl": 0.0172882080078125, "learning_rate": 2.9854021031123555e-06, "loss": 0.0549, "reward": 0.7291666865348816, "reward_std": 0.27258947119116783, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 105 }, { "completion_length": 610.9583511352539, "epoch": 0.2826666666666667, "grad_norm": 3.0257515907287598, "kl": 0.10125732421875, "learning_rate": 2.984414406957518e-06, "loss": 0.183, "reward": 0.3333333469927311, "reward_std": 0.2686738818883896, "rewards/accuracy_reward": 0.3333333469927311, "rewards/format_reward": 0.0, "step": 106 }, { "completion_length": 779.4375228881836, "epoch": 0.2853333333333333, "grad_norm": 0.8972102403640747, "kl": 0.0275115966796875, "learning_rate": 2.983394555946126e-06, "loss": -0.0191, "reward": 0.6250000298023224, "reward_std": 0.3131455294787884, "rewards/accuracy_reward": 0.6250000298023224, "rewards/format_reward": 0.0, "step": 107 }, { "completion_length": 613.2708587646484, "epoch": 0.288, "grad_norm": 0.301284521818161, "kl": 0.0101165771484375, "learning_rate": 2.9823425721698293e-06, "loss": 0.0303, "reward": 0.5625, "reward_std": 0.11558075994253159, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.0, "step": 108 }, { "completion_length": 808.8542022705078, "epoch": 0.2906666666666667, "grad_norm": 0.15024465322494507, "kl": 0.01076507568359375, "learning_rate": 2.9812584784163257e-06, "loss": 0.0379, "reward": 0.4791666716337204, "reward_std": 0.2996268570423126, "rewards/accuracy_reward": 0.4791666716337204, "rewards/format_reward": 0.0, "step": 109 }, { "completion_length": 610.6666946411133, "epoch": 0.29333333333333333, "grad_norm": 0.2403380125761032, "kl": 0.021087646484375, "learning_rate": 2.980142298168869e-06, "loss": 0.0448, "reward": 0.5416666716337204, "reward_std": 0.47975732386112213, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.0, "step": 110 }, { "completion_length": 838.0208587646484, "epoch": 0.296, "grad_norm": 0.2263801097869873, "kl": 0.0153045654296875, "learning_rate": 2.9789940556057576e-06, "loss": -0.0202, "reward": 0.583333358168602, "reward_std": 0.3602609820663929, "rewards/accuracy_reward": 0.583333358168602, "rewards/format_reward": 0.0, "step": 111 }, { "completion_length": 993.0833435058594, "epoch": 0.2986666666666667, "grad_norm": 1.6176540851593018, "kl": 0.027130126953125, "learning_rate": 2.9778137755998135e-06, "loss": -0.023, "reward": 0.2500000074505806, "reward_std": 0.18404609709978104, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 112 }, { "completion_length": 701.2500152587891, "epoch": 0.30133333333333334, "grad_norm": 0.2460828274488449, "kl": 0.0155029296875, "learning_rate": 2.9766014837178418e-06, "loss": 0.0559, "reward": 0.6875000149011612, "reward_std": 0.2350771278142929, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 113 }, { "completion_length": 851.958366394043, "epoch": 0.304, "grad_norm": 0.1806672066450119, "kl": 0.01204681396484375, "learning_rate": 2.975357206220079e-06, "loss": 0.0004, "reward": 0.6458333432674408, "reward_std": 0.2350771203637123, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 114 }, { "completion_length": 853.6042022705078, "epoch": 0.30666666666666664, "grad_norm": 0.355673223733902, "kl": 0.0250244140625, "learning_rate": 2.97408097005962e-06, "loss": 0.1069, "reward": 0.5208333358168602, "reward_std": 0.34674229472875595, "rewards/accuracy_reward": 0.5208333358168602, "rewards/format_reward": 0.0, "step": 115 }, { "completion_length": 547.3333511352539, "epoch": 0.30933333333333335, "grad_norm": 0.09819953143596649, "kl": 0.00971221923828125, "learning_rate": 2.9727728028818388e-06, "loss": 0.0728, "reward": 0.8958333432674408, "reward_std": 0.1705273948609829, "rewards/accuracy_reward": 0.8958333432674408, "rewards/format_reward": 0.0, "step": 116 }, { "completion_length": 937.7708587646484, "epoch": 0.312, "grad_norm": 0.1053454652428627, "kl": 0.010650634765625, "learning_rate": 2.9714327330237873e-06, "loss": 0.0229, "reward": 0.5000000037252903, "reward_std": 0.12909945845603943, "rewards/accuracy_reward": 0.5000000037252903, "rewards/format_reward": 0.0, "step": 117 }, { "completion_length": 622.8958435058594, "epoch": 0.31466666666666665, "grad_norm": 0.1517428457736969, "kl": 0.013092041015625, "learning_rate": 2.970060789513582e-06, "loss": 0.0359, "reward": 0.8125000149011612, "reward_std": 0.235077116638422, "rewards/accuracy_reward": 0.8125000149011612, "rewards/format_reward": 0.0, "step": 118 }, { "completion_length": 873.0625305175781, "epoch": 0.31733333333333336, "grad_norm": 0.147451251745224, "kl": 0.016143798828125, "learning_rate": 2.968657002069774e-06, "loss": 0.0268, "reward": 0.6250000223517418, "reward_std": 0.3776952587068081, "rewards/accuracy_reward": 0.6250000223517418, "rewards/format_reward": 0.0, "step": 119 }, { "completion_length": 754.4375305175781, "epoch": 0.32, "grad_norm": 0.16505473852157593, "kl": 0.010101318359375, "learning_rate": 2.9672214011007086e-06, "loss": 0.0973, "reward": 0.6250000149011612, "reward_std": 0.4056045264005661, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 120 }, { "completion_length": 904.4791870117188, "epoch": 0.32266666666666666, "grad_norm": 0.1410188376903534, "kl": 0.01914215087890625, "learning_rate": 2.965754017703862e-06, "loss": 0.0569, "reward": 0.35416666977107525, "reward_std": 0.2525114119052887, "rewards/accuracy_reward": 0.35416666977107525, "rewards/format_reward": 0.0, "step": 121 }, { "completion_length": 792.5000152587891, "epoch": 0.3253333333333333, "grad_norm": 0.3636722266674042, "kl": 0.013519287109375, "learning_rate": 2.9642548836651712e-06, "loss": 0.0447, "reward": 0.6875000149011612, "reward_std": 0.33713918551802635, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 122 }, { "completion_length": 664.6666870117188, "epoch": 0.328, "grad_norm": 0.09268064051866531, "kl": 0.0204925537109375, "learning_rate": 2.962724031458345e-06, "loss": 0.0351, "reward": 0.6250000149011612, "reward_std": 0.12909945845603943, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 123 }, { "completion_length": 865.5208587646484, "epoch": 0.33066666666666666, "grad_norm": 0.1581239253282547, "kl": 0.0137176513671875, "learning_rate": 2.9611614942441577e-06, "loss": 0.0515, "reward": 0.2708333432674408, "reward_std": 0.2996268644928932, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 124 }, { "completion_length": 803.2500305175781, "epoch": 0.3333333333333333, "grad_norm": 0.19644340872764587, "kl": 0.02410888671875, "learning_rate": 2.959567305869736e-06, "loss": 0.0212, "reward": 0.6875000149011612, "reward_std": 0.33713921159505844, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 125 }, { "completion_length": 845.5000152587891, "epoch": 0.336, "grad_norm": 0.5170478820800781, "kl": 0.01678466796875, "learning_rate": 2.95794150086782e-06, "loss": 0.1042, "reward": 0.5208333432674408, "reward_std": 0.43655747920274734, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 126 }, { "completion_length": 817.9791870117188, "epoch": 0.33866666666666667, "grad_norm": 0.08609090745449066, "kl": 0.0124053955078125, "learning_rate": 2.956284114456018e-06, "loss": 0.065, "reward": 0.2500000074505806, "reward_std": 0.286108173429966, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 127 }, { "completion_length": 765.5000152587891, "epoch": 0.3413333333333333, "grad_norm": 0.14322727918624878, "kl": 0.0160369873046875, "learning_rate": 2.9545951825360466e-06, "loss": 0.0176, "reward": 0.604166679084301, "reward_std": 0.38161086291074753, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.0, "step": 128 }, { "completion_length": 702.2708587646484, "epoch": 0.344, "grad_norm": 0.1747395098209381, "kl": 0.017333984375, "learning_rate": 2.9528747416929465e-06, "loss": -0.0379, "reward": 0.4583333395421505, "reward_std": 0.18404609709978104, "rewards/accuracy_reward": 0.4583333395421505, "rewards/format_reward": 0.0, "step": 129 }, { "completion_length": 650.2916870117188, "epoch": 0.3466666666666667, "grad_norm": 0.07288848608732224, "kl": 0.0129547119140625, "learning_rate": 2.951122829194296e-06, "loss": 0.0248, "reward": 0.7083333395421505, "reward_std": 0.18404608592391014, "rewards/accuracy_reward": 0.7083333395421505, "rewards/format_reward": 0.0, "step": 130 }, { "completion_length": 641.8333587646484, "epoch": 0.34933333333333333, "grad_norm": 0.23963476717472076, "kl": 0.0562896728515625, "learning_rate": 2.9493394829893994e-06, "loss": 0.009, "reward": 0.6666667014360428, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.6666667014360428, "rewards/format_reward": 0.0, "step": 131 }, { "completion_length": 847.9583587646484, "epoch": 0.352, "grad_norm": 0.15714174509048462, "kl": 0.02581787109375, "learning_rate": 2.9475247417084673e-06, "loss": -0.0092, "reward": 0.5416666865348816, "reward_std": 0.22155843675136566, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "step": 132 }, { "completion_length": 609.7708587646484, "epoch": 0.3546666666666667, "grad_norm": 0.15627437829971313, "kl": 0.021453857421875, "learning_rate": 2.9456786446617797e-06, "loss": 0.0034, "reward": 0.5208333395421505, "reward_std": 0.2900237515568733, "rewards/accuracy_reward": 0.5208333395421505, "rewards/format_reward": 0.0, "step": 133 }, { "completion_length": 604.6250305175781, "epoch": 0.35733333333333334, "grad_norm": 0.2316051423549652, "kl": 0.0161590576171875, "learning_rate": 2.9438012318388337e-06, "loss": -0.0564, "reward": 0.6250000260770321, "reward_std": 0.23116153106093407, "rewards/accuracy_reward": 0.6250000260770321, "rewards/format_reward": 0.0, "step": 134 }, { "completion_length": 487.64585876464844, "epoch": 0.36, "grad_norm": 0.2416388839483261, "kl": 0.0159759521484375, "learning_rate": 2.9418925439074784e-06, "loss": 0.0365, "reward": 0.6458333432674408, "reward_std": 0.21764282882213593, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 135 }, { "completion_length": 736.3750152587891, "epoch": 0.3626666666666667, "grad_norm": 0.11874468624591827, "kl": 0.0135955810546875, "learning_rate": 2.9399526222130314e-06, "loss": 0.0148, "reward": 0.5625000111758709, "reward_std": 0.299626849591732, "rewards/accuracy_reward": 0.5625000111758709, "rewards/format_reward": 0.0, "step": 136 }, { "completion_length": 864.3333740234375, "epoch": 0.36533333333333334, "grad_norm": 0.21116961538791656, "kl": 0.0140228271484375, "learning_rate": 2.9379815087773864e-06, "loss": 0.0897, "reward": 0.5625000298023224, "reward_std": 0.31970490142703056, "rewards/accuracy_reward": 0.5625000298023224, "rewards/format_reward": 0.0, "step": 137 }, { "completion_length": 718.7708435058594, "epoch": 0.368, "grad_norm": 0.13117057085037231, "kl": 0.0204620361328125, "learning_rate": 2.9359792462981008e-06, "loss": -0.0376, "reward": 0.7083333432674408, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.0, "step": 138 }, { "completion_length": 740.6875305175781, "epoch": 0.37066666666666664, "grad_norm": 0.09841669350862503, "kl": 0.011810302734375, "learning_rate": 2.9339458781474724e-06, "loss": 0.0257, "reward": 0.7291666865348816, "reward_std": 0.2350771278142929, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 139 }, { "completion_length": 568.1250305175781, "epoch": 0.37333333333333335, "grad_norm": 0.1539030224084854, "kl": 0.0248260498046875, "learning_rate": 2.9318814483715983e-06, "loss": -0.0729, "reward": 0.45833334140479565, "reward_std": 0.24859581515192986, "rewards/accuracy_reward": 0.45833334140479565, "rewards/format_reward": 0.0, "step": 140 }, { "completion_length": 805.770881652832, "epoch": 0.376, "grad_norm": 0.17137649655342102, "kl": 0.017730712890625, "learning_rate": 2.9297860016894203e-06, "loss": 0.0541, "reward": 0.5208333488553762, "reward_std": 0.25515518337488174, "rewards/accuracy_reward": 0.5208333488553762, "rewards/format_reward": 0.0, "step": 141 }, { "completion_length": 761.3333511352539, "epoch": 0.37866666666666665, "grad_norm": 0.13019512593746185, "kl": 0.0190277099609375, "learning_rate": 2.9276595834917606e-06, "loss": -0.0356, "reward": 0.2708333432674408, "reward_std": 0.2525113932788372, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 142 }, { "completion_length": 707.4791793823242, "epoch": 0.38133333333333336, "grad_norm": 0.12470466643571854, "kl": 0.018096923828125, "learning_rate": 2.925502239840332e-06, "loss": 0.0384, "reward": 0.5208333432674408, "reward_std": 0.2621144950389862, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 143 }, { "completion_length": 721.7500152587891, "epoch": 0.384, "grad_norm": 0.18000547587871552, "kl": 0.02191162109375, "learning_rate": 2.9233140174667447e-06, "loss": 0.0561, "reward": 0.5208333488553762, "reward_std": 0.23507710918784142, "rewards/accuracy_reward": 0.5208333488553762, "rewards/format_reward": 0.0, "step": 144 }, { "completion_length": 726.9791717529297, "epoch": 0.38666666666666666, "grad_norm": 0.11175241321325302, "kl": 0.0316162109375, "learning_rate": 2.921094963771494e-06, "loss": 0.0123, "reward": 0.7083333432674408, "reward_std": 0.18404608219861984, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.0, "step": 145 }, { "completion_length": 760.1458435058594, "epoch": 0.3893333333333333, "grad_norm": 0.21677181124687195, "kl": 0.022491455078125, "learning_rate": 2.9188451268229305e-06, "loss": 0.0114, "reward": 0.5000000204890966, "reward_std": 0.350657869130373, "rewards/accuracy_reward": 0.5000000204890966, "rewards/format_reward": 0.0, "step": 146 }, { "completion_length": 754.7083435058594, "epoch": 0.392, "grad_norm": 0.14654108881950378, "kl": 0.020782470703125, "learning_rate": 2.9165645553562214e-06, "loss": 0.0552, "reward": 0.4791666716337204, "reward_std": 0.1530931107699871, "rewards/accuracy_reward": 0.4791666716337204, "rewards/format_reward": 0.0, "step": 147 }, { "completion_length": 544.9583435058594, "epoch": 0.39466666666666667, "grad_norm": 0.1994720995426178, "kl": 0.02850341796875, "learning_rate": 2.914253298772295e-06, "loss": -0.0785, "reward": 0.6875000298023224, "reward_std": 0.36417656019330025, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 148 }, { "completion_length": 903.3125305175781, "epoch": 0.3973333333333333, "grad_norm": 0.13350743055343628, "kl": 0.016937255859375, "learning_rate": 2.9119114071367674e-06, "loss": -0.0053, "reward": 0.3541666716337204, "reward_std": 0.317061148583889, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 149 }, { "completion_length": 850.8750152587891, "epoch": 0.4, "grad_norm": 0.10395243018865585, "kl": 0.01261138916015625, "learning_rate": 2.9095389311788626e-06, "loss": -0.0109, "reward": 0.5416666828095913, "reward_std": 0.23116153106093407, "rewards/accuracy_reward": 0.5416666828095913, "rewards/format_reward": 0.0, "step": 150 }, { "completion_length": 578.9583435058594, "epoch": 0.4026666666666667, "grad_norm": 0.07950767129659653, "kl": 0.0180816650390625, "learning_rate": 2.9071359222903105e-06, "loss": 0.0049, "reward": 0.7916666865348816, "reward_std": 0.16661180183291435, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 151 }, { "completion_length": 856.2708435058594, "epoch": 0.4053333333333333, "grad_norm": 0.11328104138374329, "kl": 0.02142333984375, "learning_rate": 2.9047024325242336e-06, "loss": 0.0096, "reward": 0.291666679084301, "reward_std": 0.19364918768405914, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.0, "step": 152 }, { "completion_length": 688.9166870117188, "epoch": 0.408, "grad_norm": 0.18654842674732208, "kl": 0.019317626953125, "learning_rate": 2.9022385145940218e-06, "loss": 0.0605, "reward": 0.6875000298023224, "reward_std": 0.38161083683371544, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 153 }, { "completion_length": 811.1250152587891, "epoch": 0.4106666666666667, "grad_norm": 0.16071945428848267, "kl": 0.01629638671875, "learning_rate": 2.899744221872188e-06, "loss": 0.0683, "reward": 0.6250000223517418, "reward_std": 0.3776952847838402, "rewards/accuracy_reward": 0.6250000223517418, "rewards/format_reward": 0.0, "step": 154 }, { "completion_length": 681.9375152587891, "epoch": 0.41333333333333333, "grad_norm": 0.24948441982269287, "kl": 0.02459716796875, "learning_rate": 2.8972196083892137e-06, "loss": 0.1382, "reward": 0.4583333507180214, "reward_std": 0.30354245379567146, "rewards/accuracy_reward": 0.4583333507180214, "rewards/format_reward": 0.0, "step": 155 }, { "completion_length": 705.625, "epoch": 0.416, "grad_norm": 0.40736380219459534, "kl": 0.018951416015625, "learning_rate": 2.894664728832377e-06, "loss": 0.0694, "reward": 0.6458333432674408, "reward_std": 0.33713918551802635, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "step": 156 }, { "completion_length": 583.4375228881836, "epoch": 0.4186666666666667, "grad_norm": 0.19216611981391907, "kl": 0.0235443115234375, "learning_rate": 2.8920796385445705e-06, "loss": 0.1121, "reward": 0.6250000149011612, "reward_std": 0.22155842557549477, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 157 }, { "completion_length": 743.3125152587891, "epoch": 0.42133333333333334, "grad_norm": 0.5183671712875366, "kl": 0.02801513671875, "learning_rate": 2.889464393523099e-06, "loss": -0.0522, "reward": 0.6250000223517418, "reward_std": 0.36809216812253, "rewards/accuracy_reward": 0.6250000223517418, "rewards/format_reward": 0.0, "step": 158 }, { "completion_length": 910.2083740234375, "epoch": 0.424, "grad_norm": 0.146419957280159, "kl": 0.02410888671875, "learning_rate": 2.8868190504184698e-06, "loss": -0.0069, "reward": 0.2500000074505806, "reward_std": 0.3236205168068409, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 159 }, { "completion_length": 737.9791793823242, "epoch": 0.4266666666666667, "grad_norm": 0.2719399034976959, "kl": 0.023284912109375, "learning_rate": 2.8841436665331635e-06, "loss": 0.0355, "reward": 0.541666679084301, "reward_std": 0.2686738818883896, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.0, "step": 160 }, { "completion_length": 564.8125076293945, "epoch": 0.42933333333333334, "grad_norm": 0.13706326484680176, "kl": 0.0176849365234375, "learning_rate": 2.881438299820394e-06, "loss": 0.0708, "reward": 0.8541666865348816, "reward_std": 0.19756478071212769, "rewards/accuracy_reward": 0.8541666865348816, "rewards/format_reward": 0.0, "step": 161 }, { "completion_length": 583.9375152587891, "epoch": 0.432, "grad_norm": 0.2149602472782135, "kl": 0.0257720947265625, "learning_rate": 2.878703008882852e-06, "loss": 0.0253, "reward": 0.5833333637565374, "reward_std": 0.3506578765809536, "rewards/accuracy_reward": 0.5833333637565374, "rewards/format_reward": 0.0, "step": 162 }, { "completion_length": 768.9375305175781, "epoch": 0.43466666666666665, "grad_norm": 0.1322740614414215, "kl": 0.026580810546875, "learning_rate": 2.8759378529714358e-06, "loss": -0.0072, "reward": 0.5625000149011612, "reward_std": 0.2350771203637123, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 163 }, { "completion_length": 866.8333740234375, "epoch": 0.43733333333333335, "grad_norm": 0.27988889813423157, "kl": 0.0343017578125, "learning_rate": 2.8731428919839684e-06, "loss": 0.0259, "reward": 0.4583333395421505, "reward_std": 0.3680921792984009, "rewards/accuracy_reward": 0.4583333395421505, "rewards/format_reward": 0.0, "step": 164 }, { "completion_length": 720.4166870117188, "epoch": 0.44, "grad_norm": 0.29093390703201294, "kl": 0.040069580078125, "learning_rate": 2.8703181864639013e-06, "loss": -0.0024, "reward": 0.645833358168602, "reward_std": 0.2996268570423126, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 165 }, { "completion_length": 830.4791870117188, "epoch": 0.44266666666666665, "grad_norm": 0.21840965747833252, "kl": 0.04143524169921875, "learning_rate": 2.867463797598999e-06, "loss": 0.1342, "reward": 0.708333358168602, "reward_std": 0.415207602083683, "rewards/accuracy_reward": 0.708333358168602, "rewards/format_reward": 0.0, "step": 166 }, { "completion_length": 838.8125152587891, "epoch": 0.44533333333333336, "grad_norm": 0.13068810105323792, "kl": 0.055908203125, "learning_rate": 2.8645797872200178e-06, "loss": -0.0275, "reward": 0.5833333432674408, "reward_std": 0.31314554065465927, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 167 }, { "completion_length": 860.8125305175781, "epoch": 0.448, "grad_norm": 0.232852965593338, "kl": 0.0428466796875, "learning_rate": 2.861666217799363e-06, "loss": 0.0165, "reward": 0.4375000149011612, "reward_std": 0.31970490515232086, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "step": 168 }, { "completion_length": 885.5208587646484, "epoch": 0.45066666666666666, "grad_norm": 0.34133827686309814, "kl": 0.032501220703125, "learning_rate": 2.8587231524497397e-06, "loss": 0.0144, "reward": 0.6250000223517418, "reward_std": 0.3602609820663929, "rewards/accuracy_reward": 0.6250000223517418, "rewards/format_reward": 0.0, "step": 169 }, { "completion_length": 701.1666793823242, "epoch": 0.4533333333333333, "grad_norm": 0.10134287923574448, "kl": 0.0167388916015625, "learning_rate": 2.855750654922781e-06, "loss": 0.05, "reward": 0.833333358168602, "reward_std": 0.23116152361035347, "rewards/accuracy_reward": 0.833333358168602, "rewards/format_reward": 0.0, "step": 170 }, { "completion_length": 909.3125457763672, "epoch": 0.456, "grad_norm": 0.1822938621044159, "kl": 0.02960205078125, "learning_rate": 2.852748789607671e-06, "loss": 0.1012, "reward": 0.6041666865348816, "reward_std": 0.40168893337249756, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 171 }, { "completion_length": 1057.0625305175781, "epoch": 0.45866666666666667, "grad_norm": 0.15899233520030975, "kl": 0.057373046875, "learning_rate": 2.8497176215297474e-06, "loss": 0.0381, "reward": 0.3958333432674408, "reward_std": 0.44616060703992844, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "step": 172 }, { "completion_length": 730.3750305175781, "epoch": 0.4613333333333333, "grad_norm": 2.6546502113342285, "kl": 0.094482421875, "learning_rate": 2.846657216349094e-06, "loss": 0.1087, "reward": 0.4583333432674408, "reward_std": 0.30354245752096176, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "step": 173 }, { "completion_length": 993.375, "epoch": 0.464, "grad_norm": 0.15241017937660217, "kl": 0.0701904296875, "learning_rate": 2.8435676403591196e-06, "loss": 0.038, "reward": 0.45833334885537624, "reward_std": 0.2861081510782242, "rewards/accuracy_reward": 0.45833334885537624, "rewards/format_reward": 0.0, "step": 174 }, { "completion_length": 867.5208587646484, "epoch": 0.4666666666666667, "grad_norm": 0.32228943705558777, "kl": 0.03955078125, "learning_rate": 2.8404489604851183e-06, "loss": 0.0841, "reward": 0.5625000074505806, "reward_std": 0.38161084800958633, "rewards/accuracy_reward": 0.5625000074505806, "rewards/format_reward": 0.0, "step": 175 }, { "completion_length": 930.9167022705078, "epoch": 0.4693333333333333, "grad_norm": 0.4391748607158661, "kl": 0.055755615234375, "learning_rate": 2.837301244282825e-06, "loss": 0.072, "reward": 0.7291666865348816, "reward_std": 0.334495410323143, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 176 }, { "completion_length": 959.8125, "epoch": 0.472, "grad_norm": 0.30479004979133606, "kl": 0.08367919921875, "learning_rate": 2.8341245599369467e-06, "loss": 0.1316, "reward": 0.4375000111758709, "reward_std": 0.33713920041918755, "rewards/accuracy_reward": 0.4375000111758709, "rewards/format_reward": 0.0, "step": 177 }, { "completion_length": 784.5208587646484, "epoch": 0.4746666666666667, "grad_norm": 0.4404268264770508, "kl": 0.41534423828125, "learning_rate": 2.830918976259689e-06, "loss": 0.0321, "reward": 0.7291666716337204, "reward_std": 0.235077116638422, "rewards/accuracy_reward": 0.7291666716337204, "rewards/format_reward": 0.0, "step": 178 }, { "completion_length": 704.6875076293945, "epoch": 0.47733333333333333, "grad_norm": 0.21096909046173096, "kl": 0.09588623046875, "learning_rate": 2.827684562689265e-06, "loss": -0.0393, "reward": 0.6250000298023224, "reward_std": 0.20412414148449898, "rewards/accuracy_reward": 0.6250000298023224, "rewards/format_reward": 0.0, "step": 179 }, { "completion_length": 905.7708587646484, "epoch": 0.48, "grad_norm": 0.3012356758117676, "kl": 0.159423828125, "learning_rate": 2.8244213892883906e-06, "loss": 0.1138, "reward": 0.45833333395421505, "reward_std": 0.32362050563097, "rewards/accuracy_reward": 0.45833333395421505, "rewards/format_reward": 0.0, "step": 180 }, { "completion_length": 786.2292022705078, "epoch": 0.4826666666666667, "grad_norm": 0.3507545292377472, "kl": 0.203125, "learning_rate": 2.821129526742766e-06, "loss": 0.1097, "reward": 0.4583333432674408, "reward_std": 0.32097672671079636, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "step": 181 }, { "completion_length": 819.5000305175781, "epoch": 0.48533333333333334, "grad_norm": 0.15901023149490356, "kl": 0.1199951171875, "learning_rate": 2.8178090463595464e-06, "loss": 0.0023, "reward": 0.4166666716337204, "reward_std": 0.20148037374019623, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "step": 182 }, { "completion_length": 763.6041870117188, "epoch": 0.488, "grad_norm": 0.9456762671470642, "kl": 0.3079833984375, "learning_rate": 2.814460020065795e-06, "loss": -0.0257, "reward": 0.5416666865348816, "reward_std": 0.26603008806705475, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "step": 183 }, { "completion_length": 770.3125152587891, "epoch": 0.49066666666666664, "grad_norm": 0.6205459833145142, "kl": 0.220458984375, "learning_rate": 2.8110825204069292e-06, "loss": 0.0208, "reward": 0.5000000223517418, "reward_std": 0.4230387955904007, "rewards/accuracy_reward": 0.5000000223517418, "rewards/format_reward": 0.0, "step": 184 }, { "completion_length": 920.4166870117188, "epoch": 0.49333333333333335, "grad_norm": 0.23448574542999268, "kl": 0.085784912109375, "learning_rate": 2.8076766205451433e-06, "loss": 0.1141, "reward": 0.3750000074505806, "reward_std": 0.3680921792984009, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.0, "step": 185 }, { "completion_length": 823.4375305175781, "epoch": 0.496, "grad_norm": 0.1786121129989624, "kl": 0.04027557373046875, "learning_rate": 2.8042423942578284e-06, "loss": 0.0038, "reward": 0.6041666828095913, "reward_std": 0.28219256177544594, "rewards/accuracy_reward": 0.6041666828095913, "rewards/format_reward": 0.0, "step": 186 }, { "completion_length": 816.6041870117188, "epoch": 0.49866666666666665, "grad_norm": 0.18998931348323822, "kl": 0.22528076171875, "learning_rate": 2.800779915935972e-06, "loss": 0.0081, "reward": 0.5625000204890966, "reward_std": 0.2350771240890026, "rewards/accuracy_reward": 0.5625000204890966, "rewards/format_reward": 0.0, "step": 187 }, { "completion_length": 1057.0833587646484, "epoch": 0.5013333333333333, "grad_norm": 0.6255596280097961, "kl": 0.224365234375, "learning_rate": 2.7972892605825464e-06, "loss": 0.0974, "reward": 0.3750000149011612, "reward_std": 0.24859579652547836, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "step": 188 }, { "completion_length": 768.5416870117188, "epoch": 0.504, "grad_norm": 0.36918389797210693, "kl": 0.1812286376953125, "learning_rate": 2.7937705038108863e-06, "loss": -0.0044, "reward": 0.3958333469927311, "reward_std": 0.36417658627033234, "rewards/accuracy_reward": 0.3958333469927311, "rewards/format_reward": 0.0, "step": 189 }, { "completion_length": 756.9375152587891, "epoch": 0.5066666666666667, "grad_norm": 1.1205483675003052, "kl": 0.2040252685546875, "learning_rate": 2.7902237218430485e-06, "loss": 0.2227, "reward": 0.6875000298023224, "reward_std": 0.36417656391859055, "rewards/accuracy_reward": 0.6875000298023224, "rewards/format_reward": 0.0, "step": 190 }, { "completion_length": 843.4166870117188, "epoch": 0.5093333333333333, "grad_norm": 0.6259863972663879, "kl": 0.201416015625, "learning_rate": 2.7866489915081606e-06, "loss": 0.1422, "reward": 0.3958333544433117, "reward_std": 0.38161084800958633, "rewards/accuracy_reward": 0.3958333544433117, "rewards/format_reward": 0.0, "step": 191 }, { "completion_length": 1043.2292175292969, "epoch": 0.512, "grad_norm": 3.836799144744873, "kl": 0.7412109375, "learning_rate": 2.78304639024076e-06, "loss": 0.2273, "reward": 0.4583333358168602, "reward_std": 0.2686738818883896, "rewards/accuracy_reward": 0.4583333358168602, "rewards/format_reward": 0.0, "step": 192 }, { "completion_length": 1042.2291870117188, "epoch": 0.5146666666666667, "grad_norm": 3.3550195693969727, "kl": 0.6171875, "learning_rate": 2.7794159960791125e-06, "loss": 0.1258, "reward": 0.5625000149011612, "reward_std": 0.1705274023115635, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 193 }, { "completion_length": 1017.2500305175781, "epoch": 0.5173333333333333, "grad_norm": 12.088400840759277, "kl": 1.6103515625, "learning_rate": 2.775757887663525e-06, "loss": 0.2264, "reward": 0.45833333395421505, "reward_std": 0.3977733328938484, "rewards/accuracy_reward": 0.45833333395421505, "rewards/format_reward": 0.0, "step": 194 }, { "completion_length": 995.3542175292969, "epoch": 0.52, "grad_norm": 4.430953502655029, "kl": 0.4501953125, "learning_rate": 2.772072144234639e-06, "loss": 0.0376, "reward": 0.3750000111758709, "reward_std": 0.3776952587068081, "rewards/accuracy_reward": 0.3750000111758709, "rewards/format_reward": 0.0, "step": 195 }, { "completion_length": 707.4375076293945, "epoch": 0.5226666666666666, "grad_norm": 2.779906749725342, "kl": 0.2867431640625, "learning_rate": 2.7683588456317177e-06, "loss": 0.0161, "reward": 0.6250000223517418, "reward_std": 0.38552645593881607, "rewards/accuracy_reward": 0.6250000223517418, "rewards/format_reward": 0.0, "step": 196 }, { "completion_length": 682.6250152587891, "epoch": 0.5253333333333333, "grad_norm": 1.7151806354522705, "kl": 0.238525390625, "learning_rate": 2.764618072290913e-06, "loss": -0.0224, "reward": 0.37500000558793545, "reward_std": 0.32097671553492546, "rewards/accuracy_reward": 0.37500000558793545, "rewards/format_reward": 0.0, "step": 197 }, { "completion_length": 908.3541717529297, "epoch": 0.528, "grad_norm": 5.420147895812988, "kl": 0.329833984375, "learning_rate": 2.7608499052435266e-06, "loss": 0.0899, "reward": 0.583333358168602, "reward_std": 0.4500761702656746, "rewards/accuracy_reward": 0.583333358168602, "rewards/format_reward": 0.0, "step": 198 }, { "completion_length": 823.0208587646484, "epoch": 0.5306666666666666, "grad_norm": 2.1091978549957275, "kl": 0.2232666015625, "learning_rate": 2.757054426114251e-06, "loss": 0.0821, "reward": 0.666666679084301, "reward_std": 0.22155843675136566, "rewards/accuracy_reward": 0.666666679084301, "rewards/format_reward": 0.0, "step": 199 }, { "completion_length": 670.4583587646484, "epoch": 0.5333333333333333, "grad_norm": 3.2582030296325684, "kl": 0.3426513671875, "learning_rate": 2.753231717119405e-06, "loss": -0.0217, "reward": 0.6250000149011612, "reward_std": 0.2861081585288048, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 200 }, { "completion_length": 620.9583435058594, "epoch": 0.536, "grad_norm": 2.6548030376434326, "kl": 0.317626953125, "learning_rate": 2.749381861065149e-06, "loss": 0.0096, "reward": 0.4375000111758709, "reward_std": 0.28219256177544594, "rewards/accuracy_reward": 0.4375000111758709, "rewards/format_reward": 0.0, "step": 201 }, { "completion_length": 744.6041870117188, "epoch": 0.5386666666666666, "grad_norm": 1.7339693307876587, "kl": 0.538330078125, "learning_rate": 2.7455049413456964e-06, "loss": 0.0956, "reward": 0.47916669212281704, "reward_std": 0.28219255805015564, "rewards/accuracy_reward": 0.47916669212281704, "rewards/format_reward": 0.0, "step": 202 }, { "completion_length": 586.5208511352539, "epoch": 0.5413333333333333, "grad_norm": 4.914543151855469, "kl": 0.373291015625, "learning_rate": 2.741601041941501e-06, "loss": 0.0462, "reward": 0.645833358168602, "reward_std": 0.33713919669389725, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 203 }, { "completion_length": 734.645866394043, "epoch": 0.544, "grad_norm": 4.155093193054199, "kl": 0.401123046875, "learning_rate": 2.7376702474174426e-06, "loss": -0.0151, "reward": 0.604166679084301, "reward_std": 0.2350771352648735, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.0, "step": 204 }, { "completion_length": 737.4375305175781, "epoch": 0.5466666666666666, "grad_norm": 3.2076289653778076, "kl": 0.97149658203125, "learning_rate": 2.7337126429209934e-06, "loss": 0.0455, "reward": 0.5625000149011612, "reward_std": 0.33713920041918755, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 205 }, { "completion_length": 620.2500152587891, "epoch": 0.5493333333333333, "grad_norm": 28.125852584838867, "kl": 4.544921875, "learning_rate": 2.729728314180373e-06, "loss": 0.0901, "reward": 0.479166679084301, "reward_std": 0.42872631549835205, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "step": 206 }, { "completion_length": 894.8333587646484, "epoch": 0.552, "grad_norm": 43.36728286743164, "kl": 1.431640625, "learning_rate": 2.725717347502693e-06, "loss": 0.14, "reward": 0.3958333432674408, "reward_std": 0.36417659372091293, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "step": 207 }, { "completion_length": 730.4583587646484, "epoch": 0.5546666666666666, "grad_norm": 15.029769897460938, "kl": 1.0419921875, "learning_rate": 2.7216798297720855e-06, "loss": 0.0142, "reward": 0.2916666753590107, "reward_std": 0.2861081659793854, "rewards/accuracy_reward": 0.2916666753590107, "rewards/format_reward": 0.0, "step": 208 }, { "completion_length": 599.1666870117188, "epoch": 0.5573333333333333, "grad_norm": 2.065986394882202, "kl": 0.42669677734375, "learning_rate": 2.7176158484478224e-06, "loss": 0.0308, "reward": 0.5416666716337204, "reward_std": 0.3776952847838402, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.0, "step": 209 }, { "completion_length": 697.2083435058594, "epoch": 0.56, "grad_norm": 0.4965348541736603, "kl": 0.12176513671875, "learning_rate": 2.713525491562421e-06, "loss": 0.0151, "reward": 0.5416666772216558, "reward_std": 0.23899271339178085, "rewards/accuracy_reward": 0.5416666772216558, "rewards/format_reward": 0.0, "step": 210 }, { "completion_length": 594.5208587646484, "epoch": 0.5626666666666666, "grad_norm": 8.045109748840332, "kl": 0.330322265625, "learning_rate": 2.709408847719737e-06, "loss": 0.0581, "reward": 0.5625000111758709, "reward_std": 0.37377967685461044, "rewards/accuracy_reward": 0.5625000111758709, "rewards/format_reward": 0.0, "step": 211 }, { "completion_length": 852.3333435058594, "epoch": 0.5653333333333334, "grad_norm": 2.485161066055298, "kl": 0.36962890625, "learning_rate": 2.705266006093043e-06, "loss": 0.0566, "reward": 0.645833358168602, "reward_std": 0.37377968057990074, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 212 }, { "completion_length": 788.0833435058594, "epoch": 0.568, "grad_norm": 2.3095505237579346, "kl": 0.4625244140625, "learning_rate": 2.7010970564231e-06, "loss": 0.0677, "reward": 0.6041666828095913, "reward_std": 0.27258946746587753, "rewards/accuracy_reward": 0.6041666828095913, "rewards/format_reward": 0.0, "step": 213 }, { "completion_length": 673.5833587646484, "epoch": 0.5706666666666667, "grad_norm": 40.70173645019531, "kl": 5.20703125, "learning_rate": 2.696902089016213e-06, "loss": 0.3097, "reward": 0.4791666753590107, "reward_std": 0.40952012687921524, "rewards/accuracy_reward": 0.4791666753590107, "rewards/format_reward": 0.0, "step": 214 }, { "completion_length": 735.2708511352539, "epoch": 0.5733333333333334, "grad_norm": 5.52685022354126, "kl": 1.55078125, "learning_rate": 2.6926811947422717e-06, "loss": -0.021, "reward": 0.5208333358168602, "reward_std": 0.3170611336827278, "rewards/accuracy_reward": 0.5208333358168602, "rewards/format_reward": 0.0, "step": 215 }, { "completion_length": 659.0000228881836, "epoch": 0.576, "grad_norm": 7.676098346710205, "kl": 0.54638671875, "learning_rate": 2.688434465032786e-06, "loss": 0.132, "reward": 0.5833333432674408, "reward_std": 0.4326419085264206, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 216 }, { "completion_length": 651.1666717529297, "epoch": 0.5786666666666667, "grad_norm": 2.9158401489257812, "kl": 0.712890625, "learning_rate": 2.6841619918789038e-06, "loss": 0.0471, "reward": 0.6250000149011612, "reward_std": 0.26603010296821594, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 217 }, { "completion_length": 708.9166870117188, "epoch": 0.5813333333333334, "grad_norm": 7.161975383758545, "kl": 1.939453125, "learning_rate": 2.679863867829417e-06, "loss": 0.199, "reward": 0.3750000149011612, "reward_std": 0.26603008806705475, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "step": 218 }, { "completion_length": 791.8958435058594, "epoch": 0.584, "grad_norm": 3.1737961769104004, "kl": 0.8603515625, "learning_rate": 2.67554018598876e-06, "loss": 0.0281, "reward": 0.479166679084301, "reward_std": 0.2350771278142929, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "step": 219 }, { "completion_length": 940.2708435058594, "epoch": 0.5866666666666667, "grad_norm": 4.666038990020752, "kl": 0.8388671875, "learning_rate": 2.671191040014989e-06, "loss": 0.0128, "reward": 0.541666679084301, "reward_std": 0.3506578803062439, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.0, "step": 220 }, { "completion_length": 721.3958435058594, "epoch": 0.5893333333333334, "grad_norm": 0.6132449507713318, "kl": 0.56298828125, "learning_rate": 2.666816524117757e-06, "loss": -0.0265, "reward": 0.2708333395421505, "reward_std": 0.2446802221238613, "rewards/accuracy_reward": 0.2708333395421505, "rewards/format_reward": 0.0, "step": 221 }, { "completion_length": 859.0416870117188, "epoch": 0.592, "grad_norm": 4.1063385009765625, "kl": 0.29248046875, "learning_rate": 2.6624167330562694e-06, "loss": -0.0277, "reward": 0.4791666828095913, "reward_std": 0.34674228727817535, "rewards/accuracy_reward": 0.4791666828095913, "rewards/format_reward": 0.0, "step": 222 }, { "completion_length": 823.2291870117188, "epoch": 0.5946666666666667, "grad_norm": 1.2631244659423828, "kl": 0.5458984375, "learning_rate": 2.657991762137235e-06, "loss": 0.0232, "reward": 0.2500000037252903, "reward_std": 0.2957112602889538, "rewards/accuracy_reward": 0.2500000037252903, "rewards/format_reward": 0.0, "step": 223 }, { "completion_length": 575.0000152587891, "epoch": 0.5973333333333334, "grad_norm": 5.164083957672119, "kl": 3.855224609375, "learning_rate": 2.653541707212799e-06, "loss": -0.0518, "reward": 0.3333333395421505, "reward_std": 0.3332235999405384, "rewards/accuracy_reward": 0.3333333395421505, "rewards/format_reward": 0.0, "step": 224 }, { "completion_length": 526.0000228881836, "epoch": 0.6, "grad_norm": 4.015169143676758, "kl": 1.4453125, "learning_rate": 2.649066664678467e-06, "loss": -0.0142, "reward": 0.6250000223517418, "reward_std": 0.4230388030409813, "rewards/accuracy_reward": 0.6250000223517418, "rewards/format_reward": 0.0, "step": 225 }, { "completion_length": 660.2083587646484, "epoch": 0.6026666666666667, "grad_norm": 2.765709400177002, "kl": 2.708984375, "learning_rate": 2.6445667314710174e-06, "loss": 0.0736, "reward": 0.4375000149011612, "reward_std": 0.21764283254742622, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "step": 226 }, { "completion_length": 628.0000076293945, "epoch": 0.6053333333333333, "grad_norm": 1.9771387577056885, "kl": 0.358154296875, "learning_rate": 2.6400420050664027e-06, "loss": 0.0267, "reward": 0.5833333432674408, "reward_std": 0.3506578728556633, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "step": 227 }, { "completion_length": 596.3750228881836, "epoch": 0.608, "grad_norm": 9.701559066772461, "kl": 2.94921875, "learning_rate": 2.6354925834776346e-06, "loss": 0.1108, "reward": 0.6666666865348816, "reward_std": 0.40296073257923126, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 228 }, { "completion_length": 695.5208587646484, "epoch": 0.6106666666666667, "grad_norm": 10.11015796661377, "kl": 2.19921875, "learning_rate": 2.6309185652526653e-06, "loss": 0.1138, "reward": 0.29166667349636555, "reward_std": 0.31314554437994957, "rewards/accuracy_reward": 0.29166667349636555, "rewards/format_reward": 0.0, "step": 229 }, { "completion_length": 897.9583435058594, "epoch": 0.6133333333333333, "grad_norm": 4.603687763214111, "kl": 2.24267578125, "learning_rate": 2.626320049472249e-06, "loss": 0.0939, "reward": 0.3958333469927311, "reward_std": 0.3266642242670059, "rewards/accuracy_reward": 0.3958333469927311, "rewards/format_reward": 0.0, "step": 230 }, { "completion_length": 647.6041870117188, "epoch": 0.616, "grad_norm": 3.770287036895752, "kl": 1.5478515625, "learning_rate": 2.621697135747798e-06, "loss": 0.0139, "reward": 0.4791666865348816, "reward_std": 0.3720077723264694, "rewards/accuracy_reward": 0.4791666865348816, "rewards/format_reward": 0.0, "step": 231 }, { "completion_length": 513.5833435058594, "epoch": 0.6186666666666667, "grad_norm": 2.456714153289795, "kl": 2.1630859375, "learning_rate": 2.6170499242192243e-06, "loss": 0.0842, "reward": 0.4375000149011612, "reward_std": 0.2446802221238613, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "step": 232 }, { "completion_length": 921.4583740234375, "epoch": 0.6213333333333333, "grad_norm": 3.1872363090515137, "kl": 2.4111328125, "learning_rate": 2.6123785155527693e-06, "loss": 0.0178, "reward": 0.5000000074505806, "reward_std": 0.4797573611140251, "rewards/accuracy_reward": 0.5000000074505806, "rewards/format_reward": 0.0, "step": 233 }, { "completion_length": 564.6458511352539, "epoch": 0.624, "grad_norm": 2.388471841812134, "kl": 1.87744140625, "learning_rate": 2.607683010938826e-06, "loss": -0.0138, "reward": 0.37500001303851604, "reward_std": 0.2861081659793854, "rewards/accuracy_reward": 0.37500001303851604, "rewards/format_reward": 0.0, "step": 234 }, { "completion_length": 594.4166870117188, "epoch": 0.6266666666666667, "grad_norm": 3.252562999725342, "kl": 0.6201171875, "learning_rate": 2.6029635120897432e-06, "loss": 0.008, "reward": 0.3333333432674408, "reward_std": 0.3680921792984009, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 235 }, { "completion_length": 556.0208435058594, "epoch": 0.6293333333333333, "grad_norm": 4.7144317626953125, "kl": 0.822265625, "learning_rate": 2.5982201212376253e-06, "loss": 0.1413, "reward": 0.3958333507180214, "reward_std": 0.35457348451018333, "rewards/accuracy_reward": 0.3958333507180214, "rewards/format_reward": 0.0, "step": 236 }, { "completion_length": 661.8541870117188, "epoch": 0.632, "grad_norm": 2.591197967529297, "kl": 0.4580078125, "learning_rate": 2.5934529411321173e-06, "loss": 0.1095, "reward": 0.5000000149011612, "reward_std": 0.37592336907982826, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "step": 237 }, { "completion_length": 802.5625305175781, "epoch": 0.6346666666666667, "grad_norm": 2.123358964920044, "kl": 1.932373046875, "learning_rate": 2.588662075038178e-06, "loss": 0.2456, "reward": 0.3125000074505806, "reward_std": 0.2525114193558693, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.0, "step": 238 }, { "completion_length": 822.4583740234375, "epoch": 0.6373333333333333, "grad_norm": 3.56559157371521, "kl": 2.3046875, "learning_rate": 2.583847626733842e-06, "loss": 0.194, "reward": 0.458333358168602, "reward_std": 0.3506578654050827, "rewards/accuracy_reward": 0.458333358168602, "rewards/format_reward": 0.0, "step": 239 }, { "completion_length": 815.1458435058594, "epoch": 0.64, "grad_norm": 1.742954134941101, "kl": 1.6240234375, "learning_rate": 2.5790097005079765e-06, "loss": 0.2733, "reward": 0.3333333358168602, "reward_std": 0.4056045189499855, "rewards/accuracy_reward": 0.3333333358168602, "rewards/format_reward": 0.0, "step": 240 }, { "completion_length": 674.9375457763672, "epoch": 0.6426666666666667, "grad_norm": 1.7041113376617432, "kl": 0.5087890625, "learning_rate": 2.574148401158017e-06, "loss": 0.0284, "reward": 0.416666679084301, "reward_std": 0.24859580025076866, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "step": 241 }, { "completion_length": 524.5833511352539, "epoch": 0.6453333333333333, "grad_norm": 2.1262567043304443, "kl": 0.33056640625, "learning_rate": 2.5692638339877007e-06, "loss": 0.0999, "reward": 0.7291666865348816, "reward_std": 0.37377967685461044, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 242 }, { "completion_length": 697.5, "epoch": 0.648, "grad_norm": 0.536224365234375, "kl": 1.41162109375, "learning_rate": 2.5643561048047816e-06, "loss": 0.0552, "reward": 0.39583333395421505, "reward_std": 0.235077116638422, "rewards/accuracy_reward": 0.39583333395421505, "rewards/format_reward": 0.0, "step": 243 }, { "completion_length": 722.6250305175781, "epoch": 0.6506666666666666, "grad_norm": 0.7712324261665344, "kl": 1.03076171875, "learning_rate": 2.559425319918743e-06, "loss": 0.0207, "reward": 0.5000000111758709, "reward_std": 0.19364918768405914, "rewards/accuracy_reward": 0.5000000111758709, "rewards/format_reward": 0.0, "step": 244 }, { "completion_length": 853.6875152587891, "epoch": 0.6533333333333333, "grad_norm": 115.63752746582031, "kl": 3.2158203125, "learning_rate": 2.5544715861384928e-06, "loss": 0.3086, "reward": 0.0833333358168602, "reward_std": 0.16661179810762405, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 245 }, { "completion_length": 890.3958587646484, "epoch": 0.656, "grad_norm": 1.7879916429519653, "kl": 1.380859375, "learning_rate": 2.549495010770048e-06, "loss": 0.0694, "reward": 0.14583333395421505, "reward_std": 0.18796167895197868, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 246 }, { "completion_length": 719.2916870117188, "epoch": 0.6586666666666666, "grad_norm": 0.9578667283058167, "kl": 1.2451171875, "learning_rate": 2.5444957016142144e-06, "loss": 0.0935, "reward": 0.18750000186264515, "reward_std": 0.25515517219901085, "rewards/accuracy_reward": 0.18750000186264515, "rewards/format_reward": 0.0, "step": 247 }, { "completion_length": 832.8958587646484, "epoch": 0.6613333333333333, "grad_norm": 0.5120651125907898, "kl": 0.66162109375, "learning_rate": 2.5394737669642457e-06, "loss": -0.0347, "reward": 0.2500000074505806, "reward_std": 0.3131455257534981, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 248 }, { "completion_length": 647.9791870117188, "epoch": 0.664, "grad_norm": 0.7588649392127991, "kl": 0.511962890625, "learning_rate": 2.5344293156035046e-06, "loss": -0.0722, "reward": 0.2916666753590107, "reward_std": 0.3872983753681183, "rewards/accuracy_reward": 0.2916666753590107, "rewards/format_reward": 0.0, "step": 249 }, { "completion_length": 553.7708435058594, "epoch": 0.6666666666666666, "grad_norm": 2.4626846313476562, "kl": 0.6552734375, "learning_rate": 2.529362456803101e-06, "loss": -0.0393, "reward": 0.4583333432674408, "reward_std": 0.3506578877568245, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "step": 250 }, { "completion_length": 539.1666793823242, "epoch": 0.6693333333333333, "grad_norm": 1.022650957107544, "kl": 0.418212890625, "learning_rate": 2.5242733003195252e-06, "loss": 0.0062, "reward": 0.520833358168602, "reward_std": 0.1801304928958416, "rewards/accuracy_reward": 0.520833358168602, "rewards/format_reward": 0.0, "step": 251 }, { "completion_length": 657.2083587646484, "epoch": 0.672, "grad_norm": 1.0106619596481323, "kl": 0.384521484375, "learning_rate": 2.519161956392275e-06, "loss": 0.086, "reward": 0.416666679084301, "reward_std": 0.32097672671079636, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "step": 252 }, { "completion_length": 598.6666717529297, "epoch": 0.6746666666666666, "grad_norm": 0.43861910700798035, "kl": 0.755126953125, "learning_rate": 2.514028535741463e-06, "loss": 0.0453, "reward": 0.4583333432674408, "reward_std": 0.30354243889451027, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "step": 253 }, { "completion_length": 729.8333587646484, "epoch": 0.6773333333333333, "grad_norm": 0.8300098776817322, "kl": 1.365478515625, "learning_rate": 2.5088731495654205e-06, "loss": -0.0343, "reward": 0.2916666716337204, "reward_std": 0.30354245007038116, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 254 }, { "completion_length": 585.8750152587891, "epoch": 0.68, "grad_norm": 3.1680819988250732, "kl": 1.72802734375, "learning_rate": 2.5036959095382875e-06, "loss": 0.0543, "reward": 0.2291666679084301, "reward_std": 0.21764283627271652, "rewards/accuracy_reward": 0.2291666679084301, "rewards/format_reward": 0.0, "step": 255 }, { "completion_length": 701.7083587646484, "epoch": 0.6826666666666666, "grad_norm": 3.7772672176361084, "kl": 1.54052734375, "learning_rate": 2.4984969278075954e-06, "loss": 0.0048, "reward": 0.22916666977107525, "reward_std": 0.30745804682374, "rewards/accuracy_reward": 0.22916666977107525, "rewards/format_reward": 0.0, "step": 256 }, { "completion_length": 692.0000076293945, "epoch": 0.6853333333333333, "grad_norm": 11.947521209716797, "kl": 3.556640625, "learning_rate": 2.4932763169918353e-06, "loss": 0.0981, "reward": 0.18750000186264515, "reward_std": 0.2525114119052887, "rewards/accuracy_reward": 0.18750000186264515, "rewards/format_reward": 0.0, "step": 257 }, { "completion_length": 586.1458511352539, "epoch": 0.688, "grad_norm": 1.9000444412231445, "kl": 1.68359375, "learning_rate": 2.4880341901780208e-06, "loss": 0.0541, "reward": 0.1875000111758709, "reward_std": 0.23507710546255112, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 258 }, { "completion_length": 483.18750762939453, "epoch": 0.6906666666666667, "grad_norm": 1.4334735870361328, "kl": 2.0693359375, "learning_rate": 2.4827706609192375e-06, "loss": -0.0813, "reward": 0.2500000037252903, "reward_std": 0.31314555555582047, "rewards/accuracy_reward": 0.2500000037252903, "rewards/format_reward": 0.0, "step": 259 }, { "completion_length": 479.43751525878906, "epoch": 0.6933333333333334, "grad_norm": 1.8273588418960571, "kl": 1.2958984375, "learning_rate": 2.477485843232183e-06, "loss": 0.031, "reward": 0.06250000186264515, "reward_std": 0.1530931107699871, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 260 }, { "completion_length": 468.50001525878906, "epoch": 0.696, "grad_norm": 10.838756561279297, "kl": 0.923828125, "learning_rate": 2.4721798515946964e-06, "loss": -0.005, "reward": 0.2083333358168602, "reward_std": 0.3061862140893936, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 261 }, { "completion_length": 668.9375152587891, "epoch": 0.6986666666666667, "grad_norm": 1.042129635810852, "kl": 0.728515625, "learning_rate": 2.4668528009432804e-06, "loss": -0.013, "reward": 0.1041666679084301, "reward_std": 0.13301505148410797, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 262 }, { "completion_length": 398.3958511352539, "epoch": 0.7013333333333334, "grad_norm": 3.4529807567596436, "kl": 0.791015625, "learning_rate": 2.4615048066706103e-06, "loss": -0.0484, "reward": 0.06250000186264515, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 263 }, { "completion_length": 423.5208435058594, "epoch": 0.704, "grad_norm": 0.7864275574684143, "kl": 0.6142578125, "learning_rate": 2.456135984623035e-06, "loss": -0.0284, "reward": 0.0416666679084301, "reward_std": 0.10206206887960434, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 264 }, { "completion_length": 473.39584732055664, "epoch": 0.7066666666666667, "grad_norm": 3.637108564376831, "kl": 0.5068359375, "learning_rate": 2.4507464510980654e-06, "loss": -0.04, "reward": 0.2708333469927311, "reward_std": 0.30922994762659073, "rewards/accuracy_reward": 0.2708333469927311, "rewards/format_reward": 0.0, "step": 265 }, { "completion_length": 455.2291793823242, "epoch": 0.7093333333333334, "grad_norm": 1.2332038879394531, "kl": 0.580078125, "learning_rate": 2.44533632284186e-06, "loss": 0.0084, "reward": 0.14583333767950535, "reward_std": 0.2350771315395832, "rewards/accuracy_reward": 0.14583333767950535, "rewards/format_reward": 0.0, "step": 266 }, { "completion_length": 636.5625152587891, "epoch": 0.712, "grad_norm": 4.876010417938232, "kl": 0.791015625, "learning_rate": 2.439905717046691e-06, "loss": -0.0555, "reward": 0.16666666977107525, "reward_std": 0.18404608592391014, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.0, "step": 267 }, { "completion_length": 663.1041717529297, "epoch": 0.7146666666666667, "grad_norm": 1.758858561515808, "kl": 0.2115478515625, "learning_rate": 2.434454751348408e-06, "loss": 0.0277, "reward": 0.5625000149011612, "reward_std": 0.4932760149240494, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 268 }, { "completion_length": 617.1042022705078, "epoch": 0.7173333333333334, "grad_norm": 2.5681865215301514, "kl": 0.284912109375, "learning_rate": 2.4289835438238904e-06, "loss": -0.0989, "reward": 0.3958333469927311, "reward_std": 0.4758417531847954, "rewards/accuracy_reward": 0.3958333469927311, "rewards/format_reward": 0.0, "step": 269 }, { "completion_length": 676.3541870117188, "epoch": 0.72, "grad_norm": 0.852873682975769, "kl": 0.2982177734375, "learning_rate": 2.4234922129884873e-06, "loss": -0.0539, "reward": 0.39583334885537624, "reward_std": 0.40168890357017517, "rewards/accuracy_reward": 0.39583334885537624, "rewards/format_reward": 0.0, "step": 270 }, { "completion_length": 613.6041870117188, "epoch": 0.7226666666666667, "grad_norm": 4.721111297607422, "kl": 0.610107421875, "learning_rate": 2.417980877793454e-06, "loss": 0.0059, "reward": 0.2291666753590107, "reward_std": 0.24468021839857101, "rewards/accuracy_reward": 0.2291666753590107, "rewards/format_reward": 0.0, "step": 271 }, { "completion_length": 618.0000152587891, "epoch": 0.7253333333333334, "grad_norm": 6.144564151763916, "kl": 1.78515625, "learning_rate": 2.4124496576233714e-06, "loss": 0.0333, "reward": 0.1666666679084301, "reward_std": 0.23899272084236145, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 272 }, { "completion_length": 540.5833358764648, "epoch": 0.728, "grad_norm": 16.126615524291992, "kl": 0.8798828125, "learning_rate": 2.4068986722935626e-06, "loss": -0.0133, "reward": 0.1875000074505806, "reward_std": 0.24468021839857101, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 273 }, { "completion_length": 453.3958435058594, "epoch": 0.7306666666666667, "grad_norm": 235.8284454345703, "kl": 1.6103515625, "learning_rate": 2.4013280420474953e-06, "loss": 0.0641, "reward": 0.06250000186264515, "reward_std": 0.11558076739311218, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 274 }, { "completion_length": 458.7291793823242, "epoch": 0.7333333333333333, "grad_norm": 5939.08154296875, "kl": 16.091796875, "learning_rate": 2.3957378875541795e-06, "loss": 0.8365, "reward": 0.0625, "reward_std": 0.06846532225608826, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 275 }, { "completion_length": 541.1666946411133, "epoch": 0.736, "grad_norm": 7498.9140625, "kl": 78.00390625, "learning_rate": 2.3901283299055523e-06, "loss": 2.987, "reward": 0.12500000558793545, "reward_std": 0.10206207260489464, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 276 }, { "completion_length": 546.1666946411133, "epoch": 0.7386666666666667, "grad_norm": 80.91847229003906, "kl": 7.0234375, "learning_rate": 2.3844994906138548e-06, "loss": 0.1409, "reward": 0.14583333395421505, "reward_std": 0.18796168267726898, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 277 }, { "completion_length": 548.6666870117188, "epoch": 0.7413333333333333, "grad_norm": 30603.955078125, "kl": 68.61328125, "learning_rate": 2.3788514916090007e-06, "loss": 2.6195, "reward": 0.1875, "reward_std": 0.2525114044547081, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 278 }, { "completion_length": 619.6458587646484, "epoch": 0.744, "grad_norm": 126.99922943115234, "kl": 2.765625, "learning_rate": 2.3731844552359343e-06, "loss": 0.1247, "reward": 0.06250000186264515, "reward_std": 0.11558076366782188, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 279 }, { "completion_length": 622.7083435058594, "epoch": 0.7466666666666667, "grad_norm": 220.41571044921875, "kl": 1.99169921875, "learning_rate": 2.36749850425198e-06, "loss": 0.0629, "reward": 0.1875000037252903, "reward_std": 0.19756478071212769, "rewards/accuracy_reward": 0.1875000037252903, "rewards/format_reward": 0.0, "step": 280 }, { "completion_length": 778.5625305175781, "epoch": 0.7493333333333333, "grad_norm": 179.80470275878906, "kl": 0.821533203125, "learning_rate": 2.3617937618241844e-06, "loss": 0.0322, "reward": 0.22916667349636555, "reward_std": 0.35457346215844154, "rewards/accuracy_reward": 0.22916667349636555, "rewards/format_reward": 0.0, "step": 281 }, { "completion_length": 634.8541793823242, "epoch": 0.752, "grad_norm": 10.700740814208984, "kl": 1.4990234375, "learning_rate": 2.356070351526648e-06, "loss": 0.0582, "reward": 0.14583333767950535, "reward_std": 0.1801304928958416, "rewards/accuracy_reward": 0.14583333767950535, "rewards/format_reward": 0.0, "step": 282 }, { "completion_length": 759.5208740234375, "epoch": 0.7546666666666667, "grad_norm": 9.8982572555542, "kl": 2.18798828125, "learning_rate": 2.3503283973378465e-06, "loss": 0.1048, "reward": 0.12500000558793545, "reward_std": 0.10206207633018494, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 283 }, { "completion_length": 692.6666717529297, "epoch": 0.7573333333333333, "grad_norm": 3.14382004737854, "kl": 0.900390625, "learning_rate": 2.344568023637949e-06, "loss": 0.0795, "reward": 0.16666666977107525, "reward_std": 0.24859581515192986, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.0, "step": 284 }, { "completion_length": 646.4583587646484, "epoch": 0.76, "grad_norm": 662.5714111328125, "kl": 3.06103515625, "learning_rate": 2.3387893552061204e-06, "loss": 0.1911, "reward": 0.1875, "reward_std": 0.06846532225608826, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 285 }, { "completion_length": 568.3750228881836, "epoch": 0.7626666666666667, "grad_norm": 68222.53125, "kl": 257.8154296875, "learning_rate": 2.332992517217819e-06, "loss": 7.0561, "reward": 0.2916666828095913, "reward_std": 0.16661180555820465, "rewards/accuracy_reward": 0.2916666828095913, "rewards/format_reward": 0.0, "step": 286 }, { "completion_length": 665.0625305175781, "epoch": 0.7653333333333333, "grad_norm": 3179.5244140625, "kl": 2.095703125, "learning_rate": 2.327177635242086e-06, "loss": 0.0406, "reward": 0.1250000037252903, "reward_std": 0.22155844047665596, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 287 }, { "completion_length": 644.2083511352539, "epoch": 0.768, "grad_norm": 36765.8671875, "kl": 173.8992919921875, "learning_rate": 2.3213448352388254e-06, "loss": 7.3332, "reward": 0.47916667722165585, "reward_std": 0.1530931070446968, "rewards/accuracy_reward": 0.47916667722165585, "rewards/format_reward": 0.0, "step": 288 }, { "completion_length": 546.6250076293945, "epoch": 0.7706666666666667, "grad_norm": 999.5093383789062, "kl": 4.895263671875, "learning_rate": 2.315494243556075e-06, "loss": 0.1335, "reward": 0.2291666753590107, "reward_std": 0.2350771315395832, "rewards/accuracy_reward": 0.2291666753590107, "rewards/format_reward": 0.0, "step": 289 }, { "completion_length": 656.7083435058594, "epoch": 0.7733333333333333, "grad_norm": 4107.3349609375, "kl": 6.30517578125, "learning_rate": 2.3096259869272697e-06, "loss": 0.2397, "reward": 0.2291666753590107, "reward_std": 0.2621144950389862, "rewards/accuracy_reward": 0.2291666753590107, "rewards/format_reward": 0.0, "step": 290 }, { "completion_length": 636.6875152587891, "epoch": 0.776, "grad_norm": 7.706014633178711, "kl": 0.93994140625, "learning_rate": 2.303740192468495e-06, "loss": 0.0058, "reward": 0.18750000186264515, "reward_std": 0.21764283999800682, "rewards/accuracy_reward": 0.18750000186264515, "rewards/format_reward": 0.0, "step": 291 }, { "completion_length": 515.2083511352539, "epoch": 0.7786666666666666, "grad_norm": 4894.8564453125, "kl": 180.23291015625, "learning_rate": 2.2978369876757365e-06, "loss": 8.3769, "reward": 0.2291666753590107, "reward_std": 0.21764283999800682, "rewards/accuracy_reward": 0.2291666753590107, "rewards/format_reward": 0.0, "step": 292 }, { "completion_length": 626.1666870117188, "epoch": 0.7813333333333333, "grad_norm": 2025.8204345703125, "kl": 36.9638671875, "learning_rate": 2.2919165004221152e-06, "loss": 1.4316, "reward": 0.5208333432674408, "reward_std": 0.1705273911356926, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 293 }, { "completion_length": 644.4166870117188, "epoch": 0.784, "grad_norm": 2703.410888671875, "kl": 4.21484375, "learning_rate": 2.285978858955119e-06, "loss": 0.1355, "reward": 0.3541666716337204, "reward_std": 0.2900237664580345, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 294 }, { "completion_length": 605.5000228881836, "epoch": 0.7866666666666666, "grad_norm": 52.43381881713867, "kl": 1.0858154296875, "learning_rate": 2.280024191893823e-06, "loss": 0.0851, "reward": 0.5416666865348816, "reward_std": 0.20412414148449898, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "step": 295 }, { "completion_length": 741.2916870117188, "epoch": 0.7893333333333333, "grad_norm": 93.758056640625, "kl": 0.751220703125, "learning_rate": 2.274052628226107e-06, "loss": -0.0145, "reward": 0.3958333358168602, "reward_std": 0.39121396839618683, "rewards/accuracy_reward": 0.3958333358168602, "rewards/format_reward": 0.0, "step": 296 }, { "completion_length": 699.8958587646484, "epoch": 0.792, "grad_norm": 62896.68359375, "kl": 9.72021484375, "learning_rate": 2.268064297305857e-06, "loss": 0.5246, "reward": 0.2083333395421505, "reward_std": 0.23116152733564377, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 297 }, { "completion_length": 647.5416870117188, "epoch": 0.7946666666666666, "grad_norm": 30.405757904052734, "kl": 0.86474609375, "learning_rate": 2.2620593288501667e-06, "loss": 0.0116, "reward": 0.4375000074505806, "reward_std": 0.2525114119052887, "rewards/accuracy_reward": 0.4375000074505806, "rewards/format_reward": 0.0, "step": 298 }, { "completion_length": 659.2083435058594, "epoch": 0.7973333333333333, "grad_norm": 3.99859356880188, "kl": 0.583648681640625, "learning_rate": 2.256037852936525e-06, "loss": -0.0159, "reward": 0.29166667349636555, "reward_std": 0.16661179810762405, "rewards/accuracy_reward": 0.29166667349636555, "rewards/format_reward": 0.0, "step": 299 }, { "completion_length": 637.7708587646484, "epoch": 0.8, "grad_norm": 27.611656188964844, "kl": 2.47705078125, "learning_rate": 2.25e-06, "loss": 0.1998, "reward": 0.35416668839752674, "reward_std": 0.36417656019330025, "rewards/accuracy_reward": 0.35416668839752674, "rewards/format_reward": 0.0, "step": 300 }, { "completion_length": 682.1666870117188, "epoch": 0.8026666666666666, "grad_norm": 1630.4654541015625, "kl": 7.66650390625, "learning_rate": 2.243945900830413e-06, "loss": 0.1792, "reward": 0.27083334513008595, "reward_std": 0.28219256177544594, "rewards/accuracy_reward": 0.27083334513008595, "rewards/format_reward": 0.0, "step": 301 }, { "completion_length": 413.50001525878906, "epoch": 0.8053333333333333, "grad_norm": 88005.484375, "kl": 838.18359375, "learning_rate": 2.237875686569506e-06, "loss": 48.0791, "reward": 0.0833333358168602, "reward_std": 0.11949636787176132, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 302 }, { "completion_length": 614.7916870117188, "epoch": 0.808, "grad_norm": 111.8994140625, "kl": 1.419921875, "learning_rate": 2.231789488708099e-06, "loss": 0.075, "reward": 0.1041666716337204, "reward_std": 0.05103103816509247, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 303 }, { "completion_length": 629.0000305175781, "epoch": 0.8106666666666666, "grad_norm": 6898.142578125, "kl": 24.58984375, "learning_rate": 2.2256874390832447e-06, "loss": 1.1764, "reward": 0.16666667722165585, "reward_std": 0.18404607102274895, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 304 }, { "completion_length": 717.2291870117188, "epoch": 0.8133333333333334, "grad_norm": 8847.892578125, "kl": 19.443359375, "learning_rate": 2.2195696698753695e-06, "loss": 1.5009, "reward": 0.0416666679084301, "reward_std": 0.06454972922801971, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 305 }, { "completion_length": 743.3333435058594, "epoch": 0.816, "grad_norm": 458.5970458984375, "kl": 6.7021484375, "learning_rate": 2.213436313605413e-06, "loss": 0.3691, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 306 }, { "completion_length": 934.1250152587891, "epoch": 0.8186666666666667, "grad_norm": 1261.3668212890625, "kl": 20.8359375, "learning_rate": 2.2072875031319556e-06, "loss": 0.5689, "reward": 0.0416666679084301, "reward_std": 0.06454972922801971, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 307 }, { "completion_length": 545.4583435058594, "epoch": 0.8213333333333334, "grad_norm": 56254.73046875, "kl": 186.546875, "learning_rate": 2.2011233716483416e-06, "loss": 8.0071, "reward": 0.02083333395421505, "reward_std": 0.05103103816509247, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 308 }, { "completion_length": 689.2708587646484, "epoch": 0.824, "grad_norm": 992.6975708007812, "kl": 4.853515625, "learning_rate": 2.1949440526797927e-06, "loss": 0.2266, "reward": 0.22916666977107525, "reward_std": 0.2446802221238613, "rewards/accuracy_reward": 0.22916666977107525, "rewards/format_reward": 0.0, "step": 309 }, { "completion_length": 575.5000152587891, "epoch": 0.8266666666666667, "grad_norm": 14.90085506439209, "kl": 0.2930908203125, "learning_rate": 2.1887496800805174e-06, "loss": 0.0365, "reward": 0.27083333395421505, "reward_std": 0.28219255805015564, "rewards/accuracy_reward": 0.27083333395421505, "rewards/format_reward": 0.0, "step": 310 }, { "completion_length": 501.62500762939453, "epoch": 0.8293333333333334, "grad_norm": 13.801560401916504, "kl": 0.2686767578125, "learning_rate": 2.1825403880308107e-06, "loss": -0.0458, "reward": 0.5416666865348816, "reward_std": 0.16661179438233376, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "step": 311 }, { "completion_length": 567.3958587646484, "epoch": 0.832, "grad_norm": 25.853025436401367, "kl": 0.1470947265625, "learning_rate": 2.1763163110341462e-06, "loss": 0.0468, "reward": 0.5625000149011612, "reward_std": 0.235077116638422, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 312 }, { "completion_length": 664.5625076293945, "epoch": 0.8346666666666667, "grad_norm": 5866.689453125, "kl": 25.58026123046875, "learning_rate": 2.1700775839142652e-06, "loss": 1.0306, "reward": 0.7500000149011612, "reward_std": 0.2861081622540951, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 313 }, { "completion_length": 897.1667022705078, "epoch": 0.8373333333333334, "grad_norm": 2.1541504859924316, "kl": 0.10302734375, "learning_rate": 2.1638243418122534e-06, "loss": 0.0213, "reward": 0.6875000149011612, "reward_std": 0.235077116638422, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "step": 314 }, { "completion_length": 633.4166793823242, "epoch": 0.84, "grad_norm": 74721.421875, "kl": 800.21728515625, "learning_rate": 2.157556720183616e-06, "loss": 31.9069, "reward": 0.29166667722165585, "reward_std": 0.16661179810762405, "rewards/accuracy_reward": 0.29166667722165585, "rewards/format_reward": 0.0, "step": 315 }, { "completion_length": 810.0208740234375, "epoch": 0.8426666666666667, "grad_norm": 56.302738189697266, "kl": 0.1741943359375, "learning_rate": 2.151274854795342e-06, "loss": 0.0364, "reward": 0.39583334513008595, "reward_std": 0.28219256177544594, "rewards/accuracy_reward": 0.39583334513008595, "rewards/format_reward": 0.0, "step": 316 }, { "completion_length": 692.9583358764648, "epoch": 0.8453333333333334, "grad_norm": 2.8889639377593994, "kl": 0.11712646484375, "learning_rate": 2.1449788817229644e-06, "loss": 0.0226, "reward": 0.35416667722165585, "reward_std": 0.2446802221238613, "rewards/accuracy_reward": 0.35416667722165585, "rewards/format_reward": 0.0, "step": 317 }, { "completion_length": 557.2708587646484, "epoch": 0.848, "grad_norm": 59.83365249633789, "kl": 0.3212890625, "learning_rate": 2.138668937347609e-06, "loss": 0.0498, "reward": 0.5000000223517418, "reward_std": 0.2957112565636635, "rewards/accuracy_reward": 0.5000000223517418, "rewards/format_reward": 0.0, "step": 318 }, { "completion_length": 585.0417022705078, "epoch": 0.8506666666666667, "grad_norm": 57.15522384643555, "kl": 0.13848876953125, "learning_rate": 2.132345158353047e-06, "loss": -0.0406, "reward": 0.6041666716337204, "reward_std": 0.18796167895197868, "rewards/accuracy_reward": 0.6041666716337204, "rewards/format_reward": 0.0, "step": 319 }, { "completion_length": 590.7916870117188, "epoch": 0.8533333333333334, "grad_norm": 1.5622169971466064, "kl": 0.115234375, "learning_rate": 2.126007681722727e-06, "loss": 0.011, "reward": 0.5208333488553762, "reward_std": 0.1801304928958416, "rewards/accuracy_reward": 0.5208333488553762, "rewards/format_reward": 0.0, "step": 320 }, { "completion_length": 554.6458587646484, "epoch": 0.856, "grad_norm": 12.661802291870117, "kl": 0.06396484375, "learning_rate": 2.119656644736813e-06, "loss": 0.0302, "reward": 0.4791666716337204, "reward_std": 0.05103103816509247, "rewards/accuracy_reward": 0.4791666716337204, "rewards/format_reward": 0.0, "step": 321 }, { "completion_length": 620.5625305175781, "epoch": 0.8586666666666667, "grad_norm": 0.5451918840408325, "kl": 0.12109375, "learning_rate": 2.113292184969207e-06, "loss": -0.0375, "reward": 0.5208333507180214, "reward_std": 0.28219256177544594, "rewards/accuracy_reward": 0.5208333507180214, "rewards/format_reward": 0.0, "step": 322 }, { "completion_length": 674.4166793823242, "epoch": 0.8613333333333333, "grad_norm": 6.236075401306152, "kl": 0.08465576171875, "learning_rate": 2.106914440284572e-06, "loss": 0.0807, "reward": 0.3125000074505806, "reward_std": 0.36417658627033234, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.0, "step": 323 }, { "completion_length": 529.6875152587891, "epoch": 0.864, "grad_norm": 0.7760786414146423, "kl": 0.107666015625, "learning_rate": 2.100523548835343e-06, "loss": 0.0345, "reward": 0.770833358168602, "reward_std": 0.21764283627271652, "rewards/accuracy_reward": 0.770833358168602, "rewards/format_reward": 0.0, "step": 324 }, { "completion_length": 557.3541793823242, "epoch": 0.8666666666666667, "grad_norm": 0.35026639699935913, "kl": 0.1966552734375, "learning_rate": 2.0941196490587354e-06, "loss": 0.0203, "reward": 0.291666679084301, "reward_std": 0.23899272456765175, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.0, "step": 325 }, { "completion_length": 619.5000305175781, "epoch": 0.8693333333333333, "grad_norm": 1.0790166854858398, "kl": 0.156494140625, "learning_rate": 2.0877028796737477e-06, "loss": 0.0023, "reward": 0.4166666679084301, "reward_std": 0.2957112491130829, "rewards/accuracy_reward": 0.4166666679084301, "rewards/format_reward": 0.0, "step": 326 }, { "completion_length": 806.5208587646484, "epoch": 0.872, "grad_norm": 0.9412611722946167, "kl": 0.1807861328125, "learning_rate": 2.0812733796781545e-06, "loss": 0.0285, "reward": 0.2500000074505806, "reward_std": 0.24859582632780075, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 327 }, { "completion_length": 633.2083587646484, "epoch": 0.8746666666666667, "grad_norm": 2.870098114013672, "kl": 0.091064453125, "learning_rate": 2.0748312883454963e-06, "loss": -0.0113, "reward": 0.645833358168602, "reward_std": 0.2446802221238613, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 328 }, { "completion_length": 572.8958587646484, "epoch": 0.8773333333333333, "grad_norm": 0.3554651141166687, "kl": 0.192626953125, "learning_rate": 2.068376745222062e-06, "loss": 0.0353, "reward": 0.4791666865348816, "reward_std": 0.28219255432486534, "rewards/accuracy_reward": 0.4791666865348816, "rewards/format_reward": 0.0, "step": 329 }, { "completion_length": 773.5416870117188, "epoch": 0.88, "grad_norm": 0.2810516059398651, "kl": 0.16339111328125, "learning_rate": 2.061909890123868e-06, "loss": 0.018, "reward": 0.3333333432674408, "reward_std": 0.11949635669589043, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 330 }, { "completion_length": 846.6667022705078, "epoch": 0.8826666666666667, "grad_norm": 0.6509025692939758, "kl": 0.129791259765625, "learning_rate": 2.055430863133628e-06, "loss": 0.0358, "reward": 0.4166666865348816, "reward_std": 0.2957112491130829, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 0.0, "step": 331 }, { "completion_length": 717.7500152587891, "epoch": 0.8853333333333333, "grad_norm": 0.8496950268745422, "kl": 0.230712890625, "learning_rate": 2.048939804597718e-06, "loss": 0.0931, "reward": 0.416666679084301, "reward_std": 0.4152075946331024, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "step": 332 }, { "completion_length": 807.5000305175781, "epoch": 0.888, "grad_norm": 0.49586477875709534, "kl": 0.10491943359375, "learning_rate": 2.0424368551231384e-06, "loss": 0.0089, "reward": 0.5208333432674408, "reward_std": 0.3170611187815666, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 333 }, { "completion_length": 501.5208511352539, "epoch": 0.8906666666666667, "grad_norm": 0.40998944640159607, "kl": 0.1203460693359375, "learning_rate": 2.035922155574466e-06, "loss": -0.0339, "reward": 0.7083333432674408, "reward_std": 0.18404608592391014, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.0, "step": 334 }, { "completion_length": 844.4166870117188, "epoch": 0.8933333333333333, "grad_norm": 12.733755111694336, "kl": 0.199462890625, "learning_rate": 2.0293958470708033e-06, "loss": -0.001, "reward": 0.6250000149011612, "reward_std": 0.3506578914821148, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 335 }, { "completion_length": 662.5833435058594, "epoch": 0.896, "grad_norm": 2.5601916313171387, "kl": 0.1180419921875, "learning_rate": 2.022858070982723e-06, "loss": 0.0709, "reward": 0.7083333432674408, "reward_std": 0.2861081510782242, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.0, "step": 336 }, { "completion_length": 858.6041870117188, "epoch": 0.8986666666666666, "grad_norm": 0.2826376259326935, "kl": 0.1029052734375, "learning_rate": 2.016308968929203e-06, "loss": 0.027, "reward": 0.5625000055879354, "reward_std": 0.21764282882213593, "rewards/accuracy_reward": 0.5625000055879354, "rewards/format_reward": 0.0, "step": 337 }, { "completion_length": 878.3125152587891, "epoch": 0.9013333333333333, "grad_norm": 0.3674981892108917, "kl": 0.203125, "learning_rate": 2.0097486827745623e-06, "loss": -0.0174, "reward": 0.4791666865348816, "reward_std": 0.3720077611505985, "rewards/accuracy_reward": 0.4791666865348816, "rewards/format_reward": 0.0, "step": 338 }, { "completion_length": 714.2083587646484, "epoch": 0.904, "grad_norm": 0.7770251631736755, "kl": 0.24627685546875, "learning_rate": 2.0031773546253826e-06, "loss": 0.0725, "reward": 0.6041666716337204, "reward_std": 0.42872630804777145, "rewards/accuracy_reward": 0.6041666716337204, "rewards/format_reward": 0.0, "step": 339 }, { "completion_length": 593.9791870117188, "epoch": 0.9066666666666666, "grad_norm": 0.45360517501831055, "kl": 0.2352294921875, "learning_rate": 1.9965951268274372e-06, "loss": 0.0406, "reward": 0.5833333507180214, "reward_std": 0.31314554437994957, "rewards/accuracy_reward": 0.5833333507180214, "rewards/format_reward": 0.0, "step": 340 }, { "completion_length": 664.5625152587891, "epoch": 0.9093333333333333, "grad_norm": 0.1532231569290161, "kl": 0.172698974609375, "learning_rate": 1.9900021419626017e-06, "loss": 0.0248, "reward": 0.2291666753590107, "reward_std": 0.1801304928958416, "rewards/accuracy_reward": 0.2291666753590107, "rewards/format_reward": 0.0, "step": 341 }, { "completion_length": 617.7083587646484, "epoch": 0.912, "grad_norm": 0.21975946426391602, "kl": 0.29150390625, "learning_rate": 1.983398542845767e-06, "loss": -0.0212, "reward": 0.645833358168602, "reward_std": 0.38161083683371544, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "step": 342 }, { "completion_length": 529.5000228881836, "epoch": 0.9146666666666666, "grad_norm": 0.44750016927719116, "kl": 0.2016754150390625, "learning_rate": 1.976784472521747e-06, "loss": 0.0377, "reward": 0.7083333432674408, "reward_std": 0.3332235999405384, "rewards/accuracy_reward": 0.7083333432674408, "rewards/format_reward": 0.0, "step": 343 }, { "completion_length": 683.6458435058594, "epoch": 0.9173333333333333, "grad_norm": 0.10972858965396881, "kl": 0.06365966796875, "learning_rate": 1.9701600742621796e-06, "loss": 0.0364, "reward": 0.6041666865348816, "reward_std": 0.1801304928958416, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 344 }, { "completion_length": 833.9791870117188, "epoch": 0.92, "grad_norm": 0.28129515051841736, "kl": 0.251007080078125, "learning_rate": 1.963525491562421e-06, "loss": 0.0688, "reward": 0.4791666716337204, "reward_std": 0.25515518710017204, "rewards/accuracy_reward": 0.4791666716337204, "rewards/format_reward": 0.0, "step": 345 }, { "completion_length": 591.0000228881836, "epoch": 0.9226666666666666, "grad_norm": 0.10267713665962219, "kl": 0.1419677734375, "learning_rate": 1.9568808681384415e-06, "loss": -0.028, "reward": 0.33333333395421505, "reward_std": 0.22155842557549477, "rewards/accuracy_reward": 0.33333333395421505, "rewards/format_reward": 0.0, "step": 346 }, { "completion_length": 557.1250076293945, "epoch": 0.9253333333333333, "grad_norm": 0.27456042170524597, "kl": 0.296875, "learning_rate": 1.9502263479237084e-06, "loss": -0.0547, "reward": 0.6041666865348816, "reward_std": 0.2996268570423126, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "step": 347 }, { "completion_length": 713.6875152587891, "epoch": 0.928, "grad_norm": 0.1611674427986145, "kl": 0.17510986328125, "learning_rate": 1.9435620750660703e-06, "loss": 0.022, "reward": 0.3750000074505806, "reward_std": 0.12909945845603943, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.0, "step": 348 }, { "completion_length": 541.6041793823242, "epoch": 0.9306666666666666, "grad_norm": 0.9292861819267273, "kl": 0.426513671875, "learning_rate": 1.9368881939246333e-06, "loss": 0.0624, "reward": 0.6666666865348816, "reward_std": 0.3602609783411026, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "step": 349 }, { "completion_length": 698.0625152587891, "epoch": 0.9333333333333333, "grad_norm": 0.18113847076892853, "kl": 0.34326171875, "learning_rate": 1.9302048490666355e-06, "loss": 0.0351, "reward": 0.35416667722165585, "reward_std": 0.21764283254742622, "rewards/accuracy_reward": 0.35416667722165585, "rewards/format_reward": 0.0, "step": 350 }, { "completion_length": 647.7708435058594, "epoch": 0.936, "grad_norm": 0.23358656466007233, "kl": 0.237548828125, "learning_rate": 1.923512185264315e-06, "loss": 0.0175, "reward": 0.3958333395421505, "reward_std": 0.41129202395677567, "rewards/accuracy_reward": 0.3958333395421505, "rewards/format_reward": 0.0, "step": 351 }, { "completion_length": 674.1041870117188, "epoch": 0.9386666666666666, "grad_norm": 0.2018204629421234, "kl": 0.1859283447265625, "learning_rate": 1.916810347491772e-06, "loss": 0.0137, "reward": 0.6250000149011612, "reward_std": 0.23116153478622437, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 352 }, { "completion_length": 844.1875305175781, "epoch": 0.9413333333333334, "grad_norm": 0.3149741590023041, "kl": 0.25860595703125, "learning_rate": 1.9100994809218323e-06, "loss": 0.0756, "reward": 0.37500000186264515, "reward_std": 0.3332235924899578, "rewards/accuracy_reward": 0.37500000186264515, "rewards/format_reward": 0.0, "step": 353 }, { "completion_length": 625.4583587646484, "epoch": 0.944, "grad_norm": 0.41483765840530396, "kl": 0.3572998046875, "learning_rate": 1.9033797309228985e-06, "loss": 0.1117, "reward": 0.5625000298023224, "reward_std": 0.2996268458664417, "rewards/accuracy_reward": 0.5625000298023224, "rewards/format_reward": 0.0, "step": 354 }, { "completion_length": 701.3541717529297, "epoch": 0.9466666666666667, "grad_norm": 1.9215991497039795, "kl": 0.716796875, "learning_rate": 1.8966512430558036e-06, "loss": 0.1199, "reward": 0.5625000149011612, "reward_std": 0.2350771352648735, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 355 }, { "completion_length": 874.7916946411133, "epoch": 0.9493333333333334, "grad_norm": 2.8515124320983887, "kl": 1.845062255859375, "learning_rate": 1.8899141630706564e-06, "loss": 0.1755, "reward": 0.6875000223517418, "reward_std": 0.39208584278821945, "rewards/accuracy_reward": 0.6875000223517418, "rewards/format_reward": 0.0, "step": 356 }, { "completion_length": 766.0833435058594, "epoch": 0.952, "grad_norm": 2.6256027221679688, "kl": 2.3642578125, "learning_rate": 1.8831686369036859e-06, "loss": 0.1805, "reward": 0.39583334140479565, "reward_std": 0.2350771240890026, "rewards/accuracy_reward": 0.39583334140479565, "rewards/format_reward": 0.0, "step": 357 }, { "completion_length": 714.0208511352539, "epoch": 0.9546666666666667, "grad_norm": 1.6547703742980957, "kl": 1.00885009765625, "learning_rate": 1.876414810674079e-06, "loss": 0.0457, "reward": 0.5208333432674408, "reward_std": 0.235077116638422, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "step": 358 }, { "completion_length": 609.4583587646484, "epoch": 0.9573333333333334, "grad_norm": 0.5665653944015503, "kl": 0.771728515625, "learning_rate": 1.8696528306808168e-06, "loss": 0.0998, "reward": 0.4583333432674408, "reward_std": 0.18404607102274895, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "step": 359 }, { "completion_length": 730.7291870117188, "epoch": 0.96, "grad_norm": 1.261367917060852, "kl": 0.23529052734375, "learning_rate": 1.8628828433995015e-06, "loss": 0.03, "reward": 0.6250000149011612, "reward_std": 0.22155843302607536, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "step": 360 }, { "completion_length": 808.5625305175781, "epoch": 0.9626666666666667, "grad_norm": 1.5878499746322632, "kl": 0.54681396484375, "learning_rate": 1.8561049954791895e-06, "loss": 0.0336, "reward": 0.5208333358168602, "reward_std": 0.13301505148410797, "rewards/accuracy_reward": 0.5208333358168602, "rewards/format_reward": 0.0, "step": 361 }, { "completion_length": 815.1041870117188, "epoch": 0.9653333333333334, "grad_norm": 0.3808901309967041, "kl": 0.371826171875, "learning_rate": 1.8493194337392087e-06, "loss": 0.0859, "reward": 0.541666679084301, "reward_std": 0.16661179810762405, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.0, "step": 362 }, { "completion_length": 522.4791870117188, "epoch": 0.968, "grad_norm": 0.30578020215034485, "kl": 0.1224365234375, "learning_rate": 1.8425263051659837e-06, "loss": 0.0237, "reward": 0.5000000074505806, "reward_std": 0.18404608592391014, "rewards/accuracy_reward": 0.5000000074505806, "rewards/format_reward": 0.0, "step": 363 }, { "completion_length": 668.5000076293945, "epoch": 0.9706666666666667, "grad_norm": 0.26360616087913513, "kl": 0.136077880859375, "learning_rate": 1.8357257569098473e-06, "loss": 0.0111, "reward": 0.5208333507180214, "reward_std": 0.44616059213876724, "rewards/accuracy_reward": 0.5208333507180214, "rewards/format_reward": 0.0, "step": 364 }, { "completion_length": 728.0000228881836, "epoch": 0.9733333333333334, "grad_norm": 1.045130729675293, "kl": 0.154876708984375, "learning_rate": 1.828917936281855e-06, "loss": 0.0659, "reward": 0.3333333432674408, "reward_std": 0.30354245379567146, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 365 }, { "completion_length": 753.1458587646484, "epoch": 0.976, "grad_norm": 0.5927110910415649, "kl": 0.25830078125, "learning_rate": 1.822102990750595e-06, "loss": 0.0534, "reward": 0.3958333507180214, "reward_std": 0.36417658627033234, "rewards/accuracy_reward": 0.3958333507180214, "rewards/format_reward": 0.0, "step": 366 }, { "completion_length": 731.2291870117188, "epoch": 0.9786666666666667, "grad_norm": 0.9049092531204224, "kl": 0.2169189453125, "learning_rate": 1.8152810679389911e-06, "loss": 0.1457, "reward": 0.5416666828095913, "reward_std": 0.3602609820663929, "rewards/accuracy_reward": 0.5416666828095913, "rewards/format_reward": 0.0, "step": 367 }, { "completion_length": 763.1667022705078, "epoch": 0.9813333333333333, "grad_norm": 0.8557331562042236, "kl": 0.7392578125, "learning_rate": 1.808452315621108e-06, "loss": 0.0306, "reward": 0.45833334140479565, "reward_std": 0.35848909616470337, "rewards/accuracy_reward": 0.45833334140479565, "rewards/format_reward": 0.0, "step": 368 }, { "completion_length": 432.0416793823242, "epoch": 0.984, "grad_norm": 0.7425907254219055, "kl": 0.904541015625, "learning_rate": 1.8016168817189471e-06, "loss": 0.0233, "reward": 0.7916666865348816, "reward_std": 0.3332235962152481, "rewards/accuracy_reward": 0.7916666865348816, "rewards/format_reward": 0.0, "step": 369 }, { "completion_length": 543.8125152587891, "epoch": 0.9866666666666667, "grad_norm": 1.483704686164856, "kl": 1.626220703125, "learning_rate": 1.7947749142992453e-06, "loss": 0.1434, "reward": 0.6458333507180214, "reward_std": 0.35457349941134453, "rewards/accuracy_reward": 0.6458333507180214, "rewards/format_reward": 0.0, "step": 370 }, { "completion_length": 750.6458435058594, "epoch": 0.9893333333333333, "grad_norm": 1.1710835695266724, "kl": 1.1573486328125, "learning_rate": 1.7879265615702653e-06, "loss": 0.1069, "reward": 0.4375000149011612, "reward_std": 0.38161085173487663, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "step": 371 }, { "completion_length": 821.9583435058594, "epoch": 0.992, "grad_norm": 2.896385431289673, "kl": 3.43359375, "learning_rate": 1.7810719718785873e-06, "loss": 0.193, "reward": 0.31250000558793545, "reward_std": 0.3074580393731594, "rewards/accuracy_reward": 0.31250000558793545, "rewards/format_reward": 0.0, "step": 372 }, { "completion_length": 663.1875076293945, "epoch": 0.9946666666666667, "grad_norm": 2.6439497470855713, "kl": 3.552734375, "learning_rate": 1.7742112937058924e-06, "loss": 0.1882, "reward": 0.4583333395421505, "reward_std": 0.4248107075691223, "rewards/accuracy_reward": 0.4583333395421505, "rewards/format_reward": 0.0, "step": 373 }, { "completion_length": 633.2916793823242, "epoch": 0.9973333333333333, "grad_norm": 0.7656214237213135, "kl": 0.66461181640625, "learning_rate": 1.76734467566575e-06, "loss": 0.0844, "reward": 0.7291666865348816, "reward_std": 0.21764283999800682, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 0.0, "step": 374 }, { "completion_length": 620.1041793823242, "epoch": 1.0, "grad_norm": 1.1568132638931274, "kl": 1.1962890625, "learning_rate": 1.7604722665003958e-06, "loss": 0.1247, "reward": 0.7500000149011612, "reward_std": 0.4152076169848442, "rewards/accuracy_reward": 0.7500000149011612, "rewards/format_reward": 0.0, "step": 375 }, { "epoch": 1.0, "eval_completion_length": 723.7617282104492, "eval_kl": 1.5607896118164062, "eval_loss": 0.08730700612068176, "eval_reward": 0.4726666794717312, "eval_reward_std": 0.29052443864941596, "eval_rewards/accuracy_reward": 0.4726666794717312, "eval_rewards/format_reward": 0.0, "eval_runtime": 30137.8072, "eval_samples_per_second": 0.066, "eval_steps_per_second": 0.006, "step": 375 } ], "logging_steps": 1, "max_steps": 750, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }