| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 10, | |
| "global_step": 375, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 617.9583435058594, | |
| "epoch": 0.0026666666666666666, | |
| "grad_norm": 0.5426230430603027, | |
| "kl": 0.0, | |
| "learning_rate": 4e-08, | |
| "loss": -0.2056, | |
| "reward": 0.27083334140479565, | |
| "reward_std": 0.2350771315395832, | |
| "rewards/accuracy_reward": 0.27083334140479565, | |
| "rewards/format_reward": 0.0, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 669.7916870117188, | |
| "epoch": 0.005333333333333333, | |
| "grad_norm": 0.6748480796813965, | |
| "kl": 0.0, | |
| "learning_rate": 8e-08, | |
| "loss": -0.0475, | |
| "reward": 0.2083333395421505, | |
| "reward_std": 0.3881702348589897, | |
| "rewards/accuracy_reward": 0.2083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 896.7292022705078, | |
| "epoch": 0.008, | |
| "grad_norm": 0.4940797984600067, | |
| "kl": 0.0002243518829345703, | |
| "learning_rate": 1.2000000000000002e-07, | |
| "loss": -0.1296, | |
| "reward": 0.27083333395421505, | |
| "reward_std": 0.3842546306550503, | |
| "rewards/accuracy_reward": 0.27083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 823.6458587646484, | |
| "epoch": 0.010666666666666666, | |
| "grad_norm": 0.26322299242019653, | |
| "kl": 0.00017309188842773438, | |
| "learning_rate": 1.6e-07, | |
| "loss": -0.038, | |
| "reward": 0.1666666679084301, | |
| "reward_std": 0.23899272084236145, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 828.6875152587891, | |
| "epoch": 0.013333333333333334, | |
| "grad_norm": 0.15690098702907562, | |
| "kl": 0.0001386404037475586, | |
| "learning_rate": 2e-07, | |
| "loss": -0.1455, | |
| "reward": 0.10416666977107525, | |
| "reward_std": 0.1801304928958416, | |
| "rewards/accuracy_reward": 0.10416666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 649.8333358764648, | |
| "epoch": 0.016, | |
| "grad_norm": 0.25603243708610535, | |
| "kl": 0.00013273954391479492, | |
| "learning_rate": 2.4000000000000003e-07, | |
| "loss": -0.0906, | |
| "reward": 0.2291666716337204, | |
| "reward_std": 0.24468021839857101, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 842.6458587646484, | |
| "epoch": 0.018666666666666668, | |
| "grad_norm": 0.26339226961135864, | |
| "kl": 0.00011420249938964844, | |
| "learning_rate": 2.8e-07, | |
| "loss": -0.018, | |
| "reward": 0.0416666679084301, | |
| "reward_std": 0.10206206887960434, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 517.8750228881836, | |
| "epoch": 0.021333333333333333, | |
| "grad_norm": 0.21927940845489502, | |
| "kl": 0.0001316070556640625, | |
| "learning_rate": 3.2e-07, | |
| "loss": 0.0214, | |
| "reward": 0.2083333358168602, | |
| "reward_std": 0.3061862252652645, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 605.8541717529297, | |
| "epoch": 0.024, | |
| "grad_norm": 0.29087916016578674, | |
| "kl": 0.00013589859008789062, | |
| "learning_rate": 3.6e-07, | |
| "loss": -0.0332, | |
| "reward": 0.20833333767950535, | |
| "reward_std": 0.38817023858428, | |
| "rewards/accuracy_reward": 0.20833333767950535, | |
| "rewards/format_reward": 0.0, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 787.3333435058594, | |
| "epoch": 0.02666666666666667, | |
| "grad_norm": 0.12677079439163208, | |
| "kl": 0.0001424551010131836, | |
| "learning_rate": 4e-07, | |
| "loss": -0.0164, | |
| "reward": 0.1458333395421505, | |
| "reward_std": 0.1530931033194065, | |
| "rewards/accuracy_reward": 0.1458333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 715.2916870117188, | |
| "epoch": 0.029333333333333333, | |
| "grad_norm": 0.196958988904953, | |
| "kl": 0.00014650821685791016, | |
| "learning_rate": 4.4e-07, | |
| "loss": 0.0302, | |
| "reward": 0.12500000558793545, | |
| "reward_std": 0.18404607474803925, | |
| "rewards/accuracy_reward": 0.12500000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 685.3125305175781, | |
| "epoch": 0.032, | |
| "grad_norm": 0.2892319858074188, | |
| "kl": 0.0001569986343383789, | |
| "learning_rate": 4.800000000000001e-07, | |
| "loss": -0.0853, | |
| "reward": 0.2083333395421505, | |
| "reward_std": 0.3881702348589897, | |
| "rewards/accuracy_reward": 0.2083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 687.5000305175781, | |
| "epoch": 0.034666666666666665, | |
| "grad_norm": 0.20977553725242615, | |
| "kl": 0.0001035928726196289, | |
| "learning_rate": 5.2e-07, | |
| "loss": -0.0293, | |
| "reward": 0.1250000037252903, | |
| "reward_std": 0.16661180183291435, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 721.3125152587891, | |
| "epoch": 0.037333333333333336, | |
| "grad_norm": 0.43697115778923035, | |
| "kl": 0.00019097328186035156, | |
| "learning_rate": 5.6e-07, | |
| "loss": -0.0344, | |
| "reward": 0.31250001303851604, | |
| "reward_std": 0.36417657509446144, | |
| "rewards/accuracy_reward": 0.31250001303851604, | |
| "rewards/format_reward": 0.0, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 815.6250152587891, | |
| "epoch": 0.04, | |
| "grad_norm": 0.30842292308807373, | |
| "kl": 0.00019025802612304688, | |
| "learning_rate": 6.000000000000001e-07, | |
| "loss": 0.0081, | |
| "reward": 0.25000000931322575, | |
| "reward_std": 0.3332235962152481, | |
| "rewards/accuracy_reward": 0.25000000931322575, | |
| "rewards/format_reward": 0.0, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 714.8750152587891, | |
| "epoch": 0.042666666666666665, | |
| "grad_norm": 0.15875642001628876, | |
| "kl": 0.00021457672119140625, | |
| "learning_rate": 6.4e-07, | |
| "loss": -0.069, | |
| "reward": 0.2083333358168602, | |
| "reward_std": 0.16661180183291435, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 713.6458587646484, | |
| "epoch": 0.04533333333333334, | |
| "grad_norm": 0.19242540001869202, | |
| "kl": 0.00024271011352539062, | |
| "learning_rate": 6.8e-07, | |
| "loss": 0.0165, | |
| "reward": 0.22916666977107525, | |
| "reward_std": 0.35457348451018333, | |
| "rewards/accuracy_reward": 0.22916666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 591.5208435058594, | |
| "epoch": 0.048, | |
| "grad_norm": 0.2873741686344147, | |
| "kl": 0.00020933151245117188, | |
| "learning_rate": 7.2e-07, | |
| "loss": -0.0329, | |
| "reward": 0.22916667349636555, | |
| "reward_std": 0.40168894082307816, | |
| "rewards/accuracy_reward": 0.22916667349636555, | |
| "rewards/format_reward": 0.0, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 622.2083587646484, | |
| "epoch": 0.050666666666666665, | |
| "grad_norm": 0.20909227430820465, | |
| "kl": 0.00016427040100097656, | |
| "learning_rate": 7.600000000000001e-07, | |
| "loss": 0.1103, | |
| "reward": 0.27083333767950535, | |
| "reward_std": 0.2446802221238613, | |
| "rewards/accuracy_reward": 0.27083333767950535, | |
| "rewards/format_reward": 0.0, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 561.9166793823242, | |
| "epoch": 0.05333333333333334, | |
| "grad_norm": 1.7326514720916748, | |
| "kl": 0.003941774368286133, | |
| "learning_rate": 8e-07, | |
| "loss": -0.0138, | |
| "reward": 0.1458333358168602, | |
| "reward_std": 0.2350771278142929, | |
| "rewards/accuracy_reward": 0.1458333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 919.6250152587891, | |
| "epoch": 0.056, | |
| "grad_norm": 0.15656666457653046, | |
| "kl": 0.0001838207244873047, | |
| "learning_rate": 8.400000000000001e-07, | |
| "loss": -0.02, | |
| "reward": 0.25000001303851604, | |
| "reward_std": 0.3332235999405384, | |
| "rewards/accuracy_reward": 0.25000001303851604, | |
| "rewards/format_reward": 0.0, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 662.7500267028809, | |
| "epoch": 0.058666666666666666, | |
| "grad_norm": 0.6273130178451538, | |
| "kl": 0.0004911422729492188, | |
| "learning_rate": 8.8e-07, | |
| "loss": 0.0228, | |
| "reward": 0.2291666753590107, | |
| "reward_std": 0.31970490887761116, | |
| "rewards/accuracy_reward": 0.2291666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 724.3958511352539, | |
| "epoch": 0.06133333333333333, | |
| "grad_norm": 0.3087979257106781, | |
| "kl": 0.00020515918731689453, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.0192, | |
| "reward": 0.1458333395421505, | |
| "reward_std": 0.23507710918784142, | |
| "rewards/accuracy_reward": 0.1458333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 584.0416870117188, | |
| "epoch": 0.064, | |
| "grad_norm": 0.4410843253135681, | |
| "kl": 0.0004787445068359375, | |
| "learning_rate": 9.600000000000001e-07, | |
| "loss": -0.0191, | |
| "reward": 0.2916666716337204, | |
| "reward_std": 0.4701542556285858, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 542.3125076293945, | |
| "epoch": 0.06666666666666667, | |
| "grad_norm": 0.6044087409973145, | |
| "kl": 0.0002796649932861328, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0329, | |
| "reward": 0.27083334140479565, | |
| "reward_std": 0.3720077611505985, | |
| "rewards/accuracy_reward": 0.27083334140479565, | |
| "rewards/format_reward": 0.0, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 769.7083587646484, | |
| "epoch": 0.06933333333333333, | |
| "grad_norm": 0.25316932797431946, | |
| "kl": 0.00023126602172851562, | |
| "learning_rate": 1.04e-06, | |
| "loss": 0.0189, | |
| "reward": 0.1666666679084301, | |
| "reward_std": 0.23116152733564377, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 767.7916870117188, | |
| "epoch": 0.072, | |
| "grad_norm": 0.1700117290019989, | |
| "kl": 0.0002732276916503906, | |
| "learning_rate": 1.08e-06, | |
| "loss": -0.0018, | |
| "reward": 0.1458333395421505, | |
| "reward_std": 0.1530931033194065, | |
| "rewards/accuracy_reward": 0.1458333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 730.8333435058594, | |
| "epoch": 0.07466666666666667, | |
| "grad_norm": 0.3627185821533203, | |
| "kl": 0.0003504753112792969, | |
| "learning_rate": 1.12e-06, | |
| "loss": 0.0534, | |
| "reward": 0.0833333358168602, | |
| "reward_std": 0.20412414893507957, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 638.6666793823242, | |
| "epoch": 0.07733333333333334, | |
| "grad_norm": 0.23711198568344116, | |
| "kl": 0.0003572702407836914, | |
| "learning_rate": 1.16e-06, | |
| "loss": 0.0077, | |
| "reward": 0.12500000186264515, | |
| "reward_std": 0.22155842557549477, | |
| "rewards/accuracy_reward": 0.12500000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 576.9375228881836, | |
| "epoch": 0.08, | |
| "grad_norm": 0.1860937625169754, | |
| "kl": 0.0005288124084472656, | |
| "learning_rate": 1.2000000000000002e-06, | |
| "loss": -0.0037, | |
| "reward": 0.12500000186264515, | |
| "reward_std": 0.18404608964920044, | |
| "rewards/accuracy_reward": 0.12500000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 769.3958511352539, | |
| "epoch": 0.08266666666666667, | |
| "grad_norm": 0.19105114042758942, | |
| "kl": 0.0011196136474609375, | |
| "learning_rate": 1.24e-06, | |
| "loss": 0.1389, | |
| "reward": 0.4375000074505806, | |
| "reward_std": 0.43655750155448914, | |
| "rewards/accuracy_reward": 0.4375000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 853.6042022705078, | |
| "epoch": 0.08533333333333333, | |
| "grad_norm": 0.22852593660354614, | |
| "kl": 0.0007238388061523438, | |
| "learning_rate": 1.28e-06, | |
| "loss": 0.0189, | |
| "reward": 0.1875000037252903, | |
| "reward_std": 0.33713920414447784, | |
| "rewards/accuracy_reward": 0.1875000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 778.4791870117188, | |
| "epoch": 0.088, | |
| "grad_norm": 0.23918747901916504, | |
| "kl": 0.0009222030639648438, | |
| "learning_rate": 1.32e-06, | |
| "loss": -0.0058, | |
| "reward": 0.3958333469927311, | |
| "reward_std": 0.3816108703613281, | |
| "rewards/accuracy_reward": 0.3958333469927311, | |
| "rewards/format_reward": 0.0, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 704.6041793823242, | |
| "epoch": 0.09066666666666667, | |
| "grad_norm": 0.4135127365589142, | |
| "kl": 0.001522064208984375, | |
| "learning_rate": 1.36e-06, | |
| "loss": 0.1236, | |
| "reward": 0.1666666679084301, | |
| "reward_std": 0.3332235887646675, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 599.6458511352539, | |
| "epoch": 0.09333333333333334, | |
| "grad_norm": 0.3384934365749359, | |
| "kl": 0.0020999908447265625, | |
| "learning_rate": 1.4000000000000001e-06, | |
| "loss": -0.1167, | |
| "reward": 0.1666666716337204, | |
| "reward_std": 0.2861081697046757, | |
| "rewards/accuracy_reward": 0.1666666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 932.8750305175781, | |
| "epoch": 0.096, | |
| "grad_norm": 0.32682985067367554, | |
| "kl": 0.0016765594482421875, | |
| "learning_rate": 1.44e-06, | |
| "loss": 0.0254, | |
| "reward": 0.3750000111758709, | |
| "reward_std": 0.4326418787240982, | |
| "rewards/accuracy_reward": 0.3750000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 539.9583587646484, | |
| "epoch": 0.09866666666666667, | |
| "grad_norm": 1.6405223608016968, | |
| "kl": 0.003711700439453125, | |
| "learning_rate": 1.48e-06, | |
| "loss": -0.0445, | |
| "reward": 0.3541666716337204, | |
| "reward_std": 0.3816108778119087, | |
| "rewards/accuracy_reward": 0.3541666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 570.0625228881836, | |
| "epoch": 0.10133333333333333, | |
| "grad_norm": 0.28974005579948425, | |
| "kl": 0.004131317138671875, | |
| "learning_rate": 1.5200000000000003e-06, | |
| "loss": -0.0492, | |
| "reward": 0.2500000037252903, | |
| "reward_std": 0.2957112640142441, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 653.6666946411133, | |
| "epoch": 0.104, | |
| "grad_norm": 0.15624871850013733, | |
| "kl": 0.00244903564453125, | |
| "learning_rate": 1.56e-06, | |
| "loss": -0.0274, | |
| "reward": 0.1666666716337204, | |
| "reward_std": 0.20148035883903503, | |
| "rewards/accuracy_reward": 0.1666666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 980.7708587646484, | |
| "epoch": 0.10666666666666667, | |
| "grad_norm": 0.23032425343990326, | |
| "kl": 0.002117156982421875, | |
| "learning_rate": 1.6e-06, | |
| "loss": 0.105, | |
| "reward": 0.3958333544433117, | |
| "reward_std": 0.42872628569602966, | |
| "rewards/accuracy_reward": 0.3958333544433117, | |
| "rewards/format_reward": 0.0, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 779.8750305175781, | |
| "epoch": 0.10933333333333334, | |
| "grad_norm": 0.19505877792835236, | |
| "kl": 0.005481719970703125, | |
| "learning_rate": 1.64e-06, | |
| "loss": 0.0268, | |
| "reward": 0.31250000558793545, | |
| "reward_std": 0.3720077611505985, | |
| "rewards/accuracy_reward": 0.31250000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 731.0416870117188, | |
| "epoch": 0.112, | |
| "grad_norm": 0.15505914390087128, | |
| "kl": 0.004322052001953125, | |
| "learning_rate": 1.6800000000000002e-06, | |
| "loss": -0.014, | |
| "reward": 0.5000000074505806, | |
| "reward_std": 0.18404608964920044, | |
| "rewards/accuracy_reward": 0.5000000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 688.4583587646484, | |
| "epoch": 0.11466666666666667, | |
| "grad_norm": 0.4499289393424988, | |
| "kl": 0.00275421142578125, | |
| "learning_rate": 1.72e-06, | |
| "loss": 0.0817, | |
| "reward": 0.5416666865348816, | |
| "reward_std": 0.23116152361035347, | |
| "rewards/accuracy_reward": 0.5416666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 882.5000305175781, | |
| "epoch": 0.11733333333333333, | |
| "grad_norm": 0.12532763183116913, | |
| "kl": 0.003265380859375, | |
| "learning_rate": 1.76e-06, | |
| "loss": 0.1742, | |
| "reward": 0.3750000149011612, | |
| "reward_std": 0.3776952587068081, | |
| "rewards/accuracy_reward": 0.3750000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 962.8125305175781, | |
| "epoch": 0.12, | |
| "grad_norm": 0.11625898629426956, | |
| "kl": 0.001895904541015625, | |
| "learning_rate": 1.8e-06, | |
| "loss": 0.1109, | |
| "reward": 0.2708333395421505, | |
| "reward_std": 0.2996268458664417, | |
| "rewards/accuracy_reward": 0.2708333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 841.7500152587891, | |
| "epoch": 0.12266666666666666, | |
| "grad_norm": 0.05917806923389435, | |
| "kl": 0.004627227783203125, | |
| "learning_rate": 1.84e-06, | |
| "loss": 0.0264, | |
| "reward": 0.1250000037252903, | |
| "reward_std": 0.12909945845603943, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 960.7500305175781, | |
| "epoch": 0.12533333333333332, | |
| "grad_norm": 0.12586216628551483, | |
| "kl": 0.0055255889892578125, | |
| "learning_rate": 1.8800000000000002e-06, | |
| "loss": 0.0527, | |
| "reward": 0.35416667722165585, | |
| "reward_std": 0.2996268607676029, | |
| "rewards/accuracy_reward": 0.35416667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 838.5417022705078, | |
| "epoch": 0.128, | |
| "grad_norm": 0.10650404542684555, | |
| "kl": 0.00315093994140625, | |
| "learning_rate": 1.9200000000000003e-06, | |
| "loss": 0.1018, | |
| "reward": 0.3750000111758709, | |
| "reward_std": 0.24859581142663956, | |
| "rewards/accuracy_reward": 0.3750000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 726.0416870117188, | |
| "epoch": 0.13066666666666665, | |
| "grad_norm": 0.11485293507575989, | |
| "kl": 0.008026123046875, | |
| "learning_rate": 1.96e-06, | |
| "loss": 0.0354, | |
| "reward": 0.47916667722165585, | |
| "reward_std": 0.38161085173487663, | |
| "rewards/accuracy_reward": 0.47916667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 829.6250457763672, | |
| "epoch": 0.13333333333333333, | |
| "grad_norm": 0.1546422690153122, | |
| "kl": 0.0053253173828125, | |
| "learning_rate": 2e-06, | |
| "loss": 0.005, | |
| "reward": 0.6041666716337204, | |
| "reward_std": 0.37377967685461044, | |
| "rewards/accuracy_reward": 0.6041666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 670.8333435058594, | |
| "epoch": 0.136, | |
| "grad_norm": 0.16615261137485504, | |
| "kl": 0.0036163330078125, | |
| "learning_rate": 2.0400000000000004e-06, | |
| "loss": -0.0384, | |
| "reward": 0.416666679084301, | |
| "reward_std": 0.3131455332040787, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 822.1250152587891, | |
| "epoch": 0.13866666666666666, | |
| "grad_norm": 0.32312434911727905, | |
| "kl": 0.00707244873046875, | |
| "learning_rate": 2.08e-06, | |
| "loss": -0.0019, | |
| "reward": 0.2083333358168602, | |
| "reward_std": 0.30354245007038116, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 964.8541946411133, | |
| "epoch": 0.14133333333333334, | |
| "grad_norm": 0.08396324515342712, | |
| "kl": 0.003337860107421875, | |
| "learning_rate": 2.12e-06, | |
| "loss": 0.0572, | |
| "reward": 0.3958333395421505, | |
| "reward_std": 0.2446802258491516, | |
| "rewards/accuracy_reward": 0.3958333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 1055.0417175292969, | |
| "epoch": 0.144, | |
| "grad_norm": 0.074210025370121, | |
| "kl": 0.0041351318359375, | |
| "learning_rate": 2.16e-06, | |
| "loss": 0.0675, | |
| "reward": 0.29166667349636555, | |
| "reward_std": 0.16661180183291435, | |
| "rewards/accuracy_reward": 0.29166667349636555, | |
| "rewards/format_reward": 0.0, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 551.2083435058594, | |
| "epoch": 0.14666666666666667, | |
| "grad_norm": 0.1495818793773651, | |
| "kl": 0.007686614990234375, | |
| "learning_rate": 2.1999999999999997e-06, | |
| "loss": 0.0052, | |
| "reward": 0.4375000149011612, | |
| "reward_std": 0.1530931144952774, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 519.0833358764648, | |
| "epoch": 0.14933333333333335, | |
| "grad_norm": 0.15317903459072113, | |
| "kl": 0.0075836181640625, | |
| "learning_rate": 2.24e-06, | |
| "loss": 0.0578, | |
| "reward": 0.4375000186264515, | |
| "reward_std": 0.41912320256233215, | |
| "rewards/accuracy_reward": 0.4375000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 972.2292175292969, | |
| "epoch": 0.152, | |
| "grad_norm": 0.11835772544145584, | |
| "kl": 0.003032684326171875, | |
| "learning_rate": 2.28e-06, | |
| "loss": 0.1167, | |
| "reward": 0.4375000149011612, | |
| "reward_std": 0.33713919669389725, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 793.3958435058594, | |
| "epoch": 0.15466666666666667, | |
| "grad_norm": 0.12818466126918793, | |
| "kl": 0.002803802490234375, | |
| "learning_rate": 2.32e-06, | |
| "loss": -0.0492, | |
| "reward": 0.6250000149011612, | |
| "reward_std": 0.2686738632619381, | |
| "rewards/accuracy_reward": 0.6250000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 668.3750152587891, | |
| "epoch": 0.15733333333333333, | |
| "grad_norm": 0.11275894194841385, | |
| "kl": 0.003559112548828125, | |
| "learning_rate": 2.36e-06, | |
| "loss": 0.0205, | |
| "reward": 0.5625000149011612, | |
| "reward_std": 0.34674228727817535, | |
| "rewards/accuracy_reward": 0.5625000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 989.7292175292969, | |
| "epoch": 0.16, | |
| "grad_norm": 0.1831459105014801, | |
| "kl": 0.003498077392578125, | |
| "learning_rate": 2.4000000000000003e-06, | |
| "loss": 0.0121, | |
| "reward": 0.4583333432674408, | |
| "reward_std": 0.31314554065465927, | |
| "rewards/accuracy_reward": 0.4583333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 904.0833740234375, | |
| "epoch": 0.16266666666666665, | |
| "grad_norm": 0.08020555973052979, | |
| "kl": 0.00287628173828125, | |
| "learning_rate": 2.44e-06, | |
| "loss": 0.024, | |
| "reward": 0.5000000149011612, | |
| "reward_std": 0.30354243889451027, | |
| "rewards/accuracy_reward": 0.5000000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 806.1875305175781, | |
| "epoch": 0.16533333333333333, | |
| "grad_norm": 0.3277391791343689, | |
| "kl": 0.0061187744140625, | |
| "learning_rate": 2.48e-06, | |
| "loss": 0.0241, | |
| "reward": 0.5625000149011612, | |
| "reward_std": 0.33713919296860695, | |
| "rewards/accuracy_reward": 0.5625000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 600.7500076293945, | |
| "epoch": 0.168, | |
| "grad_norm": 0.2129485160112381, | |
| "kl": 0.008419036865234375, | |
| "learning_rate": 2.52e-06, | |
| "loss": 0.115, | |
| "reward": 0.5625000223517418, | |
| "reward_std": 0.28219257295131683, | |
| "rewards/accuracy_reward": 0.5625000223517418, | |
| "rewards/format_reward": 0.0, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 771.1458740234375, | |
| "epoch": 0.17066666666666666, | |
| "grad_norm": 0.28386905789375305, | |
| "kl": 0.00501251220703125, | |
| "learning_rate": 2.56e-06, | |
| "loss": 0.0106, | |
| "reward": 0.6041666865348816, | |
| "reward_std": 0.42872628569602966, | |
| "rewards/accuracy_reward": 0.6041666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 778.8750228881836, | |
| "epoch": 0.17333333333333334, | |
| "grad_norm": 0.10419953614473343, | |
| "kl": 0.009777069091796875, | |
| "learning_rate": 2.6e-06, | |
| "loss": 0.0624, | |
| "reward": 0.7916666865348816, | |
| "reward_std": 0.2861081622540951, | |
| "rewards/accuracy_reward": 0.7916666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 861.1875457763672, | |
| "epoch": 0.176, | |
| "grad_norm": 0.2341865748167038, | |
| "kl": 0.00921630859375, | |
| "learning_rate": 2.64e-06, | |
| "loss": 0.0133, | |
| "reward": 0.31250000558793545, | |
| "reward_std": 0.2996268570423126, | |
| "rewards/accuracy_reward": 0.31250000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 694.6875305175781, | |
| "epoch": 0.17866666666666667, | |
| "grad_norm": 0.11757036298513412, | |
| "kl": 0.013874053955078125, | |
| "learning_rate": 2.68e-06, | |
| "loss": 0.0182, | |
| "reward": 0.708333358168602, | |
| "reward_std": 0.2861081659793854, | |
| "rewards/accuracy_reward": 0.708333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 933.8750305175781, | |
| "epoch": 0.18133333333333335, | |
| "grad_norm": 0.13943161070346832, | |
| "kl": 0.012493133544921875, | |
| "learning_rate": 2.72e-06, | |
| "loss": 0.1693, | |
| "reward": 0.41666667722165585, | |
| "reward_std": 0.2861081510782242, | |
| "rewards/accuracy_reward": 0.41666667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 890.3333587646484, | |
| "epoch": 0.184, | |
| "grad_norm": 0.09695959836244583, | |
| "kl": 0.00414276123046875, | |
| "learning_rate": 2.7600000000000003e-06, | |
| "loss": 0.0099, | |
| "reward": 0.27083334885537624, | |
| "reward_std": 0.23507710918784142, | |
| "rewards/accuracy_reward": 0.27083334885537624, | |
| "rewards/format_reward": 0.0, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 740.6250305175781, | |
| "epoch": 0.18666666666666668, | |
| "grad_norm": 0.13296610116958618, | |
| "kl": 0.00751495361328125, | |
| "learning_rate": 2.8000000000000003e-06, | |
| "loss": 0.0852, | |
| "reward": 0.541666679084301, | |
| "reward_std": 0.37592337280511856, | |
| "rewards/accuracy_reward": 0.541666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 655.1041870117188, | |
| "epoch": 0.18933333333333333, | |
| "grad_norm": 0.11237625777721405, | |
| "kl": 0.008686065673828125, | |
| "learning_rate": 2.84e-06, | |
| "loss": 0.0895, | |
| "reward": 0.6875000298023224, | |
| "reward_std": 0.28219255805015564, | |
| "rewards/accuracy_reward": 0.6875000298023224, | |
| "rewards/format_reward": 0.0, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 862.8333435058594, | |
| "epoch": 0.192, | |
| "grad_norm": 0.18952777981758118, | |
| "kl": 0.0099639892578125, | |
| "learning_rate": 2.88e-06, | |
| "loss": 0.1253, | |
| "reward": 0.5833333432674408, | |
| "reward_std": 0.4230388179421425, | |
| "rewards/accuracy_reward": 0.5833333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 639.8333587646484, | |
| "epoch": 0.19466666666666665, | |
| "grad_norm": 0.09035161137580872, | |
| "kl": 0.00690460205078125, | |
| "learning_rate": 2.9200000000000004e-06, | |
| "loss": 0.0498, | |
| "reward": 0.4166666716337204, | |
| "reward_std": 0.25642700120806694, | |
| "rewards/accuracy_reward": 0.4166666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 793.6875305175781, | |
| "epoch": 0.19733333333333333, | |
| "grad_norm": 0.10165846347808838, | |
| "kl": 0.00469970703125, | |
| "learning_rate": 2.96e-06, | |
| "loss": 0.0626, | |
| "reward": 0.6250000111758709, | |
| "reward_std": 0.31314554065465927, | |
| "rewards/accuracy_reward": 0.6250000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 896.1250152587891, | |
| "epoch": 0.2, | |
| "grad_norm": 0.12082868069410324, | |
| "kl": 0.004482269287109375, | |
| "learning_rate": 3e-06, | |
| "loss": -0.0443, | |
| "reward": 0.3958333358168602, | |
| "reward_std": 0.28219256550073624, | |
| "rewards/accuracy_reward": 0.3958333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 525.1875152587891, | |
| "epoch": 0.20266666666666666, | |
| "grad_norm": 0.21093720197677612, | |
| "kl": 0.01087188720703125, | |
| "learning_rate": 2.9999837537669383e-06, | |
| "loss": 0.0263, | |
| "reward": 0.604166679084301, | |
| "reward_std": 0.2996268570423126, | |
| "rewards/accuracy_reward": 0.604166679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 632.3541717529297, | |
| "epoch": 0.20533333333333334, | |
| "grad_norm": 0.09489479660987854, | |
| "kl": 0.00710296630859375, | |
| "learning_rate": 2.9999350154196726e-06, | |
| "loss": 0.0416, | |
| "reward": 0.6875000298023224, | |
| "reward_std": 0.21764283254742622, | |
| "rewards/accuracy_reward": 0.6875000298023224, | |
| "rewards/format_reward": 0.0, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 835.8125152587891, | |
| "epoch": 0.208, | |
| "grad_norm": 0.11504478007555008, | |
| "kl": 0.00841522216796875, | |
| "learning_rate": 2.9998537860139563e-06, | |
| "loss": 0.0233, | |
| "reward": 0.5416666865348816, | |
| "reward_std": 0.4152076058089733, | |
| "rewards/accuracy_reward": 0.5416666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 860.5000305175781, | |
| "epoch": 0.21066666666666667, | |
| "grad_norm": 0.07925013452768326, | |
| "kl": 0.00824737548828125, | |
| "learning_rate": 2.9997400673093517e-06, | |
| "loss": 0.0732, | |
| "reward": 0.5000000111758709, | |
| "reward_std": 0.26603008806705475, | |
| "rewards/accuracy_reward": 0.5000000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 933.6458587646484, | |
| "epoch": 0.21333333333333335, | |
| "grad_norm": 0.1141030564904213, | |
| "kl": 0.009246826171875, | |
| "learning_rate": 2.9995938617691924e-06, | |
| "loss": -0.0376, | |
| "reward": 0.2916666716337204, | |
| "reward_std": 0.20148037374019623, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 606.5416870117188, | |
| "epoch": 0.216, | |
| "grad_norm": 0.09949828684329987, | |
| "kl": 0.0077667236328125, | |
| "learning_rate": 2.9994151725605313e-06, | |
| "loss": 0.1411, | |
| "reward": 0.5416666716337204, | |
| "reward_std": 0.25642701238393784, | |
| "rewards/accuracy_reward": 0.5416666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 759.7500305175781, | |
| "epoch": 0.21866666666666668, | |
| "grad_norm": 0.11169271171092987, | |
| "kl": 0.00763702392578125, | |
| "learning_rate": 2.9992040035540708e-06, | |
| "loss": 0.0378, | |
| "reward": 0.6458333432674408, | |
| "reward_std": 0.35457348451018333, | |
| "rewards/accuracy_reward": 0.6458333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 885.8958587646484, | |
| "epoch": 0.22133333333333333, | |
| "grad_norm": 0.09573396295309067, | |
| "kl": 0.005886077880859375, | |
| "learning_rate": 2.9989603593240777e-06, | |
| "loss": 0.1027, | |
| "reward": 0.4583333507180214, | |
| "reward_std": 0.23116152733564377, | |
| "rewards/accuracy_reward": 0.4583333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 844.2291870117188, | |
| "epoch": 0.224, | |
| "grad_norm": 0.11840051412582397, | |
| "kl": 0.004993438720703125, | |
| "learning_rate": 2.9986842451482876e-06, | |
| "loss": 0.0166, | |
| "reward": 0.6458333507180214, | |
| "reward_std": 0.317061148583889, | |
| "rewards/accuracy_reward": 0.6458333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 699.4166870117188, | |
| "epoch": 0.22666666666666666, | |
| "grad_norm": 0.575175940990448, | |
| "kl": 0.00882720947265625, | |
| "learning_rate": 2.998375667007787e-06, | |
| "loss": 0.1395, | |
| "reward": 0.5833333507180214, | |
| "reward_std": 0.24859580025076866, | |
| "rewards/accuracy_reward": 0.5833333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 752.3333435058594, | |
| "epoch": 0.22933333333333333, | |
| "grad_norm": 0.08685300499200821, | |
| "kl": 0.00833892822265625, | |
| "learning_rate": 2.9980346315868857e-06, | |
| "loss": -0.0384, | |
| "reward": 0.3750000074505806, | |
| "reward_std": 0.18404608592391014, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 849.0000305175781, | |
| "epoch": 0.232, | |
| "grad_norm": 0.08769199252128601, | |
| "kl": 0.0060882568359375, | |
| "learning_rate": 2.9976611462729716e-06, | |
| "loss": -0.036, | |
| "reward": 0.3958333432674408, | |
| "reward_std": 0.309229951351881, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 692.1666946411133, | |
| "epoch": 0.23466666666666666, | |
| "grad_norm": 0.2641507387161255, | |
| "kl": 0.0601959228515625, | |
| "learning_rate": 2.997255219156351e-06, | |
| "loss": -0.0153, | |
| "reward": 0.5000000149011612, | |
| "reward_std": 0.10206207260489464, | |
| "rewards/accuracy_reward": 0.5000000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 733.9791946411133, | |
| "epoch": 0.23733333333333334, | |
| "grad_norm": 0.11071512848138809, | |
| "kl": 0.0061798095703125, | |
| "learning_rate": 2.996816859030072e-06, | |
| "loss": 0.023, | |
| "reward": 0.33333333395421505, | |
| "reward_std": 0.22155842557549477, | |
| "rewards/accuracy_reward": 0.33333333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 776.4583587646484, | |
| "epoch": 0.24, | |
| "grad_norm": 0.08652577549219131, | |
| "kl": 0.005710601806640625, | |
| "learning_rate": 2.9963460753897363e-06, | |
| "loss": 0.0425, | |
| "reward": 0.6041666865348816, | |
| "reward_std": 0.1530931070446968, | |
| "rewards/accuracy_reward": 0.6041666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 766.8333435058594, | |
| "epoch": 0.24266666666666667, | |
| "grad_norm": 0.13474039733409882, | |
| "kl": 0.00852203369140625, | |
| "learning_rate": 2.9958428784332913e-06, | |
| "loss": 0.0211, | |
| "reward": 0.5833333432674408, | |
| "reward_std": 0.3332235962152481, | |
| "rewards/accuracy_reward": 0.5833333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 817.7500152587891, | |
| "epoch": 0.24533333333333332, | |
| "grad_norm": 1.9350175857543945, | |
| "kl": 0.00949859619140625, | |
| "learning_rate": 2.995307279060811e-06, | |
| "loss": 0.105, | |
| "reward": 0.4375000074505806, | |
| "reward_std": 0.2996268458664417, | |
| "rewards/accuracy_reward": 0.4375000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 599.6875152587891, | |
| "epoch": 0.248, | |
| "grad_norm": 0.1610657274723053, | |
| "kl": 0.013885498046875, | |
| "learning_rate": 2.9947392888742567e-06, | |
| "loss": 0.0217, | |
| "reward": 0.6041666716337204, | |
| "reward_std": 0.11558075994253159, | |
| "rewards/accuracy_reward": 0.6041666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 626.25, | |
| "epoch": 0.25066666666666665, | |
| "grad_norm": 0.19253714382648468, | |
| "kl": 0.01007843017578125, | |
| "learning_rate": 2.994138920177231e-06, | |
| "loss": 0.0233, | |
| "reward": 0.583333358168602, | |
| "reward_std": 0.16661179810762405, | |
| "rewards/accuracy_reward": 0.583333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 913.4167022705078, | |
| "epoch": 0.25333333333333335, | |
| "grad_norm": 0.2549319863319397, | |
| "kl": 0.010101318359375, | |
| "learning_rate": 2.9935061859747068e-06, | |
| "loss": 0.0697, | |
| "reward": 0.41666668467223644, | |
| "reward_std": 0.36809216812253, | |
| "rewards/accuracy_reward": 0.41666668467223644, | |
| "rewards/format_reward": 0.0, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 851.0416870117188, | |
| "epoch": 0.256, | |
| "grad_norm": 0.09697781503200531, | |
| "kl": 0.007923126220703125, | |
| "learning_rate": 2.9928410999727467e-06, | |
| "loss": 0.0469, | |
| "reward": 0.5416666716337204, | |
| "reward_std": 0.22155842557549477, | |
| "rewards/accuracy_reward": 0.5416666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 1129.5000305175781, | |
| "epoch": 0.25866666666666666, | |
| "grad_norm": 0.1845501810312271, | |
| "kl": 0.00640869140625, | |
| "learning_rate": 2.9921436765782077e-06, | |
| "loss": 0.0713, | |
| "reward": 0.5208333432674408, | |
| "reward_std": 0.2996268533170223, | |
| "rewards/accuracy_reward": 0.5208333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 841.6875305175781, | |
| "epoch": 0.2613333333333333, | |
| "grad_norm": 0.1793762743473053, | |
| "kl": 0.0106964111328125, | |
| "learning_rate": 2.9914139308984264e-06, | |
| "loss": 0.0075, | |
| "reward": 0.479166679084301, | |
| "reward_std": 0.28219256550073624, | |
| "rewards/accuracy_reward": 0.479166679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 668.7291717529297, | |
| "epoch": 0.264, | |
| "grad_norm": 0.2460280954837799, | |
| "kl": 0.0189361572265625, | |
| "learning_rate": 2.9906518787408948e-06, | |
| "loss": 0.0203, | |
| "reward": 0.3958333507180214, | |
| "reward_std": 0.33713920041918755, | |
| "rewards/accuracy_reward": 0.3958333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 811.4166870117188, | |
| "epoch": 0.26666666666666666, | |
| "grad_norm": 0.16320976614952087, | |
| "kl": 0.0098114013671875, | |
| "learning_rate": 2.989857536612915e-06, | |
| "loss": 0.0632, | |
| "reward": 0.4375000149011612, | |
| "reward_std": 0.2900237590074539, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 876.5000305175781, | |
| "epoch": 0.2693333333333333, | |
| "grad_norm": 2.9081761837005615, | |
| "kl": 0.05722808837890625, | |
| "learning_rate": 2.989030921721243e-06, | |
| "loss": 0.0033, | |
| "reward": 0.5, | |
| "reward_std": 0.3602609783411026, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 0.0, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 856.0833435058594, | |
| "epoch": 0.272, | |
| "grad_norm": 0.12382116168737411, | |
| "kl": 0.0175018310546875, | |
| "learning_rate": 2.988172051971717e-06, | |
| "loss": 0.0418, | |
| "reward": 0.4166666865348816, | |
| "reward_std": 0.20412414520978928, | |
| "rewards/accuracy_reward": 0.4166666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 866.6667022705078, | |
| "epoch": 0.27466666666666667, | |
| "grad_norm": 0.10861078649759293, | |
| "kl": 0.01019287109375, | |
| "learning_rate": 2.9872809459688676e-06, | |
| "loss": 0.0183, | |
| "reward": 0.520833358168602, | |
| "reward_std": 0.33713918179273605, | |
| "rewards/accuracy_reward": 0.520833358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 757.8333587646484, | |
| "epoch": 0.2773333333333333, | |
| "grad_norm": 0.13254211843013763, | |
| "kl": 0.02797698974609375, | |
| "learning_rate": 2.986357623015516e-06, | |
| "loss": 0.0117, | |
| "reward": 0.541666679084301, | |
| "reward_std": 0.19364918768405914, | |
| "rewards/accuracy_reward": 0.541666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 749.7500305175781, | |
| "epoch": 0.28, | |
| "grad_norm": 0.18112321197986603, | |
| "kl": 0.0172882080078125, | |
| "learning_rate": 2.9854021031123555e-06, | |
| "loss": 0.0549, | |
| "reward": 0.7291666865348816, | |
| "reward_std": 0.27258947119116783, | |
| "rewards/accuracy_reward": 0.7291666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 610.9583511352539, | |
| "epoch": 0.2826666666666667, | |
| "grad_norm": 3.0257515907287598, | |
| "kl": 0.10125732421875, | |
| "learning_rate": 2.984414406957518e-06, | |
| "loss": 0.183, | |
| "reward": 0.3333333469927311, | |
| "reward_std": 0.2686738818883896, | |
| "rewards/accuracy_reward": 0.3333333469927311, | |
| "rewards/format_reward": 0.0, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 779.4375228881836, | |
| "epoch": 0.2853333333333333, | |
| "grad_norm": 0.8972102403640747, | |
| "kl": 0.0275115966796875, | |
| "learning_rate": 2.983394555946126e-06, | |
| "loss": -0.0191, | |
| "reward": 0.6250000298023224, | |
| "reward_std": 0.3131455294787884, | |
| "rewards/accuracy_reward": 0.6250000298023224, | |
| "rewards/format_reward": 0.0, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 613.2708587646484, | |
| "epoch": 0.288, | |
| "grad_norm": 0.301284521818161, | |
| "kl": 0.0101165771484375, | |
| "learning_rate": 2.9823425721698293e-06, | |
| "loss": 0.0303, | |
| "reward": 0.5625, | |
| "reward_std": 0.11558075994253159, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 0.0, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 808.8542022705078, | |
| "epoch": 0.2906666666666667, | |
| "grad_norm": 0.15024465322494507, | |
| "kl": 0.01076507568359375, | |
| "learning_rate": 2.9812584784163257e-06, | |
| "loss": 0.0379, | |
| "reward": 0.4791666716337204, | |
| "reward_std": 0.2996268570423126, | |
| "rewards/accuracy_reward": 0.4791666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 610.6666946411133, | |
| "epoch": 0.29333333333333333, | |
| "grad_norm": 0.2403380125761032, | |
| "kl": 0.021087646484375, | |
| "learning_rate": 2.980142298168869e-06, | |
| "loss": 0.0448, | |
| "reward": 0.5416666716337204, | |
| "reward_std": 0.47975732386112213, | |
| "rewards/accuracy_reward": 0.5416666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 838.0208587646484, | |
| "epoch": 0.296, | |
| "grad_norm": 0.2263801097869873, | |
| "kl": 0.0153045654296875, | |
| "learning_rate": 2.9789940556057576e-06, | |
| "loss": -0.0202, | |
| "reward": 0.583333358168602, | |
| "reward_std": 0.3602609820663929, | |
| "rewards/accuracy_reward": 0.583333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 993.0833435058594, | |
| "epoch": 0.2986666666666667, | |
| "grad_norm": 1.6176540851593018, | |
| "kl": 0.027130126953125, | |
| "learning_rate": 2.9778137755998135e-06, | |
| "loss": -0.023, | |
| "reward": 0.2500000074505806, | |
| "reward_std": 0.18404609709978104, | |
| "rewards/accuracy_reward": 0.2500000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 701.2500152587891, | |
| "epoch": 0.30133333333333334, | |
| "grad_norm": 0.2460828274488449, | |
| "kl": 0.0155029296875, | |
| "learning_rate": 2.9766014837178418e-06, | |
| "loss": 0.0559, | |
| "reward": 0.6875000149011612, | |
| "reward_std": 0.2350771278142929, | |
| "rewards/accuracy_reward": 0.6875000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 851.958366394043, | |
| "epoch": 0.304, | |
| "grad_norm": 0.1806672066450119, | |
| "kl": 0.01204681396484375, | |
| "learning_rate": 2.975357206220079e-06, | |
| "loss": 0.0004, | |
| "reward": 0.6458333432674408, | |
| "reward_std": 0.2350771203637123, | |
| "rewards/accuracy_reward": 0.6458333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 853.6042022705078, | |
| "epoch": 0.30666666666666664, | |
| "grad_norm": 0.355673223733902, | |
| "kl": 0.0250244140625, | |
| "learning_rate": 2.97408097005962e-06, | |
| "loss": 0.1069, | |
| "reward": 0.5208333358168602, | |
| "reward_std": 0.34674229472875595, | |
| "rewards/accuracy_reward": 0.5208333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 547.3333511352539, | |
| "epoch": 0.30933333333333335, | |
| "grad_norm": 0.09819953143596649, | |
| "kl": 0.00971221923828125, | |
| "learning_rate": 2.9727728028818388e-06, | |
| "loss": 0.0728, | |
| "reward": 0.8958333432674408, | |
| "reward_std": 0.1705273948609829, | |
| "rewards/accuracy_reward": 0.8958333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 937.7708587646484, | |
| "epoch": 0.312, | |
| "grad_norm": 0.1053454652428627, | |
| "kl": 0.010650634765625, | |
| "learning_rate": 2.9714327330237873e-06, | |
| "loss": 0.0229, | |
| "reward": 0.5000000037252903, | |
| "reward_std": 0.12909945845603943, | |
| "rewards/accuracy_reward": 0.5000000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 622.8958435058594, | |
| "epoch": 0.31466666666666665, | |
| "grad_norm": 0.1517428457736969, | |
| "kl": 0.013092041015625, | |
| "learning_rate": 2.970060789513582e-06, | |
| "loss": 0.0359, | |
| "reward": 0.8125000149011612, | |
| "reward_std": 0.235077116638422, | |
| "rewards/accuracy_reward": 0.8125000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 873.0625305175781, | |
| "epoch": 0.31733333333333336, | |
| "grad_norm": 0.147451251745224, | |
| "kl": 0.016143798828125, | |
| "learning_rate": 2.968657002069774e-06, | |
| "loss": 0.0268, | |
| "reward": 0.6250000223517418, | |
| "reward_std": 0.3776952587068081, | |
| "rewards/accuracy_reward": 0.6250000223517418, | |
| "rewards/format_reward": 0.0, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 754.4375305175781, | |
| "epoch": 0.32, | |
| "grad_norm": 0.16505473852157593, | |
| "kl": 0.010101318359375, | |
| "learning_rate": 2.9672214011007086e-06, | |
| "loss": 0.0973, | |
| "reward": 0.6250000149011612, | |
| "reward_std": 0.4056045264005661, | |
| "rewards/accuracy_reward": 0.6250000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 904.4791870117188, | |
| "epoch": 0.32266666666666666, | |
| "grad_norm": 0.1410188376903534, | |
| "kl": 0.01914215087890625, | |
| "learning_rate": 2.965754017703862e-06, | |
| "loss": 0.0569, | |
| "reward": 0.35416666977107525, | |
| "reward_std": 0.2525114119052887, | |
| "rewards/accuracy_reward": 0.35416666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 792.5000152587891, | |
| "epoch": 0.3253333333333333, | |
| "grad_norm": 0.3636722266674042, | |
| "kl": 0.013519287109375, | |
| "learning_rate": 2.9642548836651712e-06, | |
| "loss": 0.0447, | |
| "reward": 0.6875000149011612, | |
| "reward_std": 0.33713918551802635, | |
| "rewards/accuracy_reward": 0.6875000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 664.6666870117188, | |
| "epoch": 0.328, | |
| "grad_norm": 0.09268064051866531, | |
| "kl": 0.0204925537109375, | |
| "learning_rate": 2.962724031458345e-06, | |
| "loss": 0.0351, | |
| "reward": 0.6250000149011612, | |
| "reward_std": 0.12909945845603943, | |
| "rewards/accuracy_reward": 0.6250000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 865.5208587646484, | |
| "epoch": 0.33066666666666666, | |
| "grad_norm": 0.1581239253282547, | |
| "kl": 0.0137176513671875, | |
| "learning_rate": 2.9611614942441577e-06, | |
| "loss": 0.0515, | |
| "reward": 0.2708333432674408, | |
| "reward_std": 0.2996268644928932, | |
| "rewards/accuracy_reward": 0.2708333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 803.2500305175781, | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 0.19644340872764587, | |
| "kl": 0.02410888671875, | |
| "learning_rate": 2.959567305869736e-06, | |
| "loss": 0.0212, | |
| "reward": 0.6875000149011612, | |
| "reward_std": 0.33713921159505844, | |
| "rewards/accuracy_reward": 0.6875000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 845.5000152587891, | |
| "epoch": 0.336, | |
| "grad_norm": 0.5170478820800781, | |
| "kl": 0.01678466796875, | |
| "learning_rate": 2.95794150086782e-06, | |
| "loss": 0.1042, | |
| "reward": 0.5208333432674408, | |
| "reward_std": 0.43655747920274734, | |
| "rewards/accuracy_reward": 0.5208333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 817.9791870117188, | |
| "epoch": 0.33866666666666667, | |
| "grad_norm": 0.08609090745449066, | |
| "kl": 0.0124053955078125, | |
| "learning_rate": 2.956284114456018e-06, | |
| "loss": 0.065, | |
| "reward": 0.2500000074505806, | |
| "reward_std": 0.286108173429966, | |
| "rewards/accuracy_reward": 0.2500000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 765.5000152587891, | |
| "epoch": 0.3413333333333333, | |
| "grad_norm": 0.14322727918624878, | |
| "kl": 0.0160369873046875, | |
| "learning_rate": 2.9545951825360466e-06, | |
| "loss": 0.0176, | |
| "reward": 0.604166679084301, | |
| "reward_std": 0.38161086291074753, | |
| "rewards/accuracy_reward": 0.604166679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 702.2708587646484, | |
| "epoch": 0.344, | |
| "grad_norm": 0.1747395098209381, | |
| "kl": 0.017333984375, | |
| "learning_rate": 2.9528747416929465e-06, | |
| "loss": -0.0379, | |
| "reward": 0.4583333395421505, | |
| "reward_std": 0.18404609709978104, | |
| "rewards/accuracy_reward": 0.4583333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 650.2916870117188, | |
| "epoch": 0.3466666666666667, | |
| "grad_norm": 0.07288848608732224, | |
| "kl": 0.0129547119140625, | |
| "learning_rate": 2.951122829194296e-06, | |
| "loss": 0.0248, | |
| "reward": 0.7083333395421505, | |
| "reward_std": 0.18404608592391014, | |
| "rewards/accuracy_reward": 0.7083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 641.8333587646484, | |
| "epoch": 0.34933333333333333, | |
| "grad_norm": 0.23963476717472076, | |
| "kl": 0.0562896728515625, | |
| "learning_rate": 2.9493394829893994e-06, | |
| "loss": 0.009, | |
| "reward": 0.6666667014360428, | |
| "reward_std": 0.23116152733564377, | |
| "rewards/accuracy_reward": 0.6666667014360428, | |
| "rewards/format_reward": 0.0, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 847.9583587646484, | |
| "epoch": 0.352, | |
| "grad_norm": 0.15714174509048462, | |
| "kl": 0.02581787109375, | |
| "learning_rate": 2.9475247417084673e-06, | |
| "loss": -0.0092, | |
| "reward": 0.5416666865348816, | |
| "reward_std": 0.22155843675136566, | |
| "rewards/accuracy_reward": 0.5416666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 609.7708587646484, | |
| "epoch": 0.3546666666666667, | |
| "grad_norm": 0.15627437829971313, | |
| "kl": 0.021453857421875, | |
| "learning_rate": 2.9456786446617797e-06, | |
| "loss": 0.0034, | |
| "reward": 0.5208333395421505, | |
| "reward_std": 0.2900237515568733, | |
| "rewards/accuracy_reward": 0.5208333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 604.6250305175781, | |
| "epoch": 0.35733333333333334, | |
| "grad_norm": 0.2316051423549652, | |
| "kl": 0.0161590576171875, | |
| "learning_rate": 2.9438012318388337e-06, | |
| "loss": -0.0564, | |
| "reward": 0.6250000260770321, | |
| "reward_std": 0.23116153106093407, | |
| "rewards/accuracy_reward": 0.6250000260770321, | |
| "rewards/format_reward": 0.0, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 487.64585876464844, | |
| "epoch": 0.36, | |
| "grad_norm": 0.2416388839483261, | |
| "kl": 0.0159759521484375, | |
| "learning_rate": 2.9418925439074784e-06, | |
| "loss": 0.0365, | |
| "reward": 0.6458333432674408, | |
| "reward_std": 0.21764282882213593, | |
| "rewards/accuracy_reward": 0.6458333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 736.3750152587891, | |
| "epoch": 0.3626666666666667, | |
| "grad_norm": 0.11874468624591827, | |
| "kl": 0.0135955810546875, | |
| "learning_rate": 2.9399526222130314e-06, | |
| "loss": 0.0148, | |
| "reward": 0.5625000111758709, | |
| "reward_std": 0.299626849591732, | |
| "rewards/accuracy_reward": 0.5625000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 864.3333740234375, | |
| "epoch": 0.36533333333333334, | |
| "grad_norm": 0.21116961538791656, | |
| "kl": 0.0140228271484375, | |
| "learning_rate": 2.9379815087773864e-06, | |
| "loss": 0.0897, | |
| "reward": 0.5625000298023224, | |
| "reward_std": 0.31970490142703056, | |
| "rewards/accuracy_reward": 0.5625000298023224, | |
| "rewards/format_reward": 0.0, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 718.7708435058594, | |
| "epoch": 0.368, | |
| "grad_norm": 0.13117057085037231, | |
| "kl": 0.0204620361328125, | |
| "learning_rate": 2.9359792462981008e-06, | |
| "loss": -0.0376, | |
| "reward": 0.7083333432674408, | |
| "reward_std": 0.10206206887960434, | |
| "rewards/accuracy_reward": 0.7083333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 740.6875305175781, | |
| "epoch": 0.37066666666666664, | |
| "grad_norm": 0.09841669350862503, | |
| "kl": 0.011810302734375, | |
| "learning_rate": 2.9339458781474724e-06, | |
| "loss": 0.0257, | |
| "reward": 0.7291666865348816, | |
| "reward_std": 0.2350771278142929, | |
| "rewards/accuracy_reward": 0.7291666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 568.1250305175781, | |
| "epoch": 0.37333333333333335, | |
| "grad_norm": 0.1539030224084854, | |
| "kl": 0.0248260498046875, | |
| "learning_rate": 2.9318814483715983e-06, | |
| "loss": -0.0729, | |
| "reward": 0.45833334140479565, | |
| "reward_std": 0.24859581515192986, | |
| "rewards/accuracy_reward": 0.45833334140479565, | |
| "rewards/format_reward": 0.0, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 805.770881652832, | |
| "epoch": 0.376, | |
| "grad_norm": 0.17137649655342102, | |
| "kl": 0.017730712890625, | |
| "learning_rate": 2.9297860016894203e-06, | |
| "loss": 0.0541, | |
| "reward": 0.5208333488553762, | |
| "reward_std": 0.25515518337488174, | |
| "rewards/accuracy_reward": 0.5208333488553762, | |
| "rewards/format_reward": 0.0, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 761.3333511352539, | |
| "epoch": 0.37866666666666665, | |
| "grad_norm": 0.13019512593746185, | |
| "kl": 0.0190277099609375, | |
| "learning_rate": 2.9276595834917606e-06, | |
| "loss": -0.0356, | |
| "reward": 0.2708333432674408, | |
| "reward_std": 0.2525113932788372, | |
| "rewards/accuracy_reward": 0.2708333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 707.4791793823242, | |
| "epoch": 0.38133333333333336, | |
| "grad_norm": 0.12470466643571854, | |
| "kl": 0.018096923828125, | |
| "learning_rate": 2.925502239840332e-06, | |
| "loss": 0.0384, | |
| "reward": 0.5208333432674408, | |
| "reward_std": 0.2621144950389862, | |
| "rewards/accuracy_reward": 0.5208333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 721.7500152587891, | |
| "epoch": 0.384, | |
| "grad_norm": 0.18000547587871552, | |
| "kl": 0.02191162109375, | |
| "learning_rate": 2.9233140174667447e-06, | |
| "loss": 0.0561, | |
| "reward": 0.5208333488553762, | |
| "reward_std": 0.23507710918784142, | |
| "rewards/accuracy_reward": 0.5208333488553762, | |
| "rewards/format_reward": 0.0, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 726.9791717529297, | |
| "epoch": 0.38666666666666666, | |
| "grad_norm": 0.11175241321325302, | |
| "kl": 0.0316162109375, | |
| "learning_rate": 2.921094963771494e-06, | |
| "loss": 0.0123, | |
| "reward": 0.7083333432674408, | |
| "reward_std": 0.18404608219861984, | |
| "rewards/accuracy_reward": 0.7083333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 760.1458435058594, | |
| "epoch": 0.3893333333333333, | |
| "grad_norm": 0.21677181124687195, | |
| "kl": 0.022491455078125, | |
| "learning_rate": 2.9188451268229305e-06, | |
| "loss": 0.0114, | |
| "reward": 0.5000000204890966, | |
| "reward_std": 0.350657869130373, | |
| "rewards/accuracy_reward": 0.5000000204890966, | |
| "rewards/format_reward": 0.0, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 754.7083435058594, | |
| "epoch": 0.392, | |
| "grad_norm": 0.14654108881950378, | |
| "kl": 0.020782470703125, | |
| "learning_rate": 2.9165645553562214e-06, | |
| "loss": 0.0552, | |
| "reward": 0.4791666716337204, | |
| "reward_std": 0.1530931107699871, | |
| "rewards/accuracy_reward": 0.4791666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 544.9583435058594, | |
| "epoch": 0.39466666666666667, | |
| "grad_norm": 0.1994720995426178, | |
| "kl": 0.02850341796875, | |
| "learning_rate": 2.914253298772295e-06, | |
| "loss": -0.0785, | |
| "reward": 0.6875000298023224, | |
| "reward_std": 0.36417656019330025, | |
| "rewards/accuracy_reward": 0.6875000298023224, | |
| "rewards/format_reward": 0.0, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 903.3125305175781, | |
| "epoch": 0.3973333333333333, | |
| "grad_norm": 0.13350743055343628, | |
| "kl": 0.016937255859375, | |
| "learning_rate": 2.9119114071367674e-06, | |
| "loss": -0.0053, | |
| "reward": 0.3541666716337204, | |
| "reward_std": 0.317061148583889, | |
| "rewards/accuracy_reward": 0.3541666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 850.8750152587891, | |
| "epoch": 0.4, | |
| "grad_norm": 0.10395243018865585, | |
| "kl": 0.01261138916015625, | |
| "learning_rate": 2.9095389311788626e-06, | |
| "loss": -0.0109, | |
| "reward": 0.5416666828095913, | |
| "reward_std": 0.23116153106093407, | |
| "rewards/accuracy_reward": 0.5416666828095913, | |
| "rewards/format_reward": 0.0, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 578.9583435058594, | |
| "epoch": 0.4026666666666667, | |
| "grad_norm": 0.07950767129659653, | |
| "kl": 0.0180816650390625, | |
| "learning_rate": 2.9071359222903105e-06, | |
| "loss": 0.0049, | |
| "reward": 0.7916666865348816, | |
| "reward_std": 0.16661180183291435, | |
| "rewards/accuracy_reward": 0.7916666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 856.2708435058594, | |
| "epoch": 0.4053333333333333, | |
| "grad_norm": 0.11328104138374329, | |
| "kl": 0.02142333984375, | |
| "learning_rate": 2.9047024325242336e-06, | |
| "loss": 0.0096, | |
| "reward": 0.291666679084301, | |
| "reward_std": 0.19364918768405914, | |
| "rewards/accuracy_reward": 0.291666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 688.9166870117188, | |
| "epoch": 0.408, | |
| "grad_norm": 0.18654842674732208, | |
| "kl": 0.019317626953125, | |
| "learning_rate": 2.9022385145940218e-06, | |
| "loss": 0.0605, | |
| "reward": 0.6875000298023224, | |
| "reward_std": 0.38161083683371544, | |
| "rewards/accuracy_reward": 0.6875000298023224, | |
| "rewards/format_reward": 0.0, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 811.1250152587891, | |
| "epoch": 0.4106666666666667, | |
| "grad_norm": 0.16071945428848267, | |
| "kl": 0.01629638671875, | |
| "learning_rate": 2.899744221872188e-06, | |
| "loss": 0.0683, | |
| "reward": 0.6250000223517418, | |
| "reward_std": 0.3776952847838402, | |
| "rewards/accuracy_reward": 0.6250000223517418, | |
| "rewards/format_reward": 0.0, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 681.9375152587891, | |
| "epoch": 0.41333333333333333, | |
| "grad_norm": 0.24948441982269287, | |
| "kl": 0.02459716796875, | |
| "learning_rate": 2.8972196083892137e-06, | |
| "loss": 0.1382, | |
| "reward": 0.4583333507180214, | |
| "reward_std": 0.30354245379567146, | |
| "rewards/accuracy_reward": 0.4583333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 705.625, | |
| "epoch": 0.416, | |
| "grad_norm": 0.40736380219459534, | |
| "kl": 0.018951416015625, | |
| "learning_rate": 2.894664728832377e-06, | |
| "loss": 0.0694, | |
| "reward": 0.6458333432674408, | |
| "reward_std": 0.33713918551802635, | |
| "rewards/accuracy_reward": 0.6458333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 583.4375228881836, | |
| "epoch": 0.4186666666666667, | |
| "grad_norm": 0.19216611981391907, | |
| "kl": 0.0235443115234375, | |
| "learning_rate": 2.8920796385445705e-06, | |
| "loss": 0.1121, | |
| "reward": 0.6250000149011612, | |
| "reward_std": 0.22155842557549477, | |
| "rewards/accuracy_reward": 0.6250000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 743.3125152587891, | |
| "epoch": 0.42133333333333334, | |
| "grad_norm": 0.5183671712875366, | |
| "kl": 0.02801513671875, | |
| "learning_rate": 2.889464393523099e-06, | |
| "loss": -0.0522, | |
| "reward": 0.6250000223517418, | |
| "reward_std": 0.36809216812253, | |
| "rewards/accuracy_reward": 0.6250000223517418, | |
| "rewards/format_reward": 0.0, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 910.2083740234375, | |
| "epoch": 0.424, | |
| "grad_norm": 0.146419957280159, | |
| "kl": 0.02410888671875, | |
| "learning_rate": 2.8868190504184698e-06, | |
| "loss": -0.0069, | |
| "reward": 0.2500000074505806, | |
| "reward_std": 0.3236205168068409, | |
| "rewards/accuracy_reward": 0.2500000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 737.9791793823242, | |
| "epoch": 0.4266666666666667, | |
| "grad_norm": 0.2719399034976959, | |
| "kl": 0.023284912109375, | |
| "learning_rate": 2.8841436665331635e-06, | |
| "loss": 0.0355, | |
| "reward": 0.541666679084301, | |
| "reward_std": 0.2686738818883896, | |
| "rewards/accuracy_reward": 0.541666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 564.8125076293945, | |
| "epoch": 0.42933333333333334, | |
| "grad_norm": 0.13706326484680176, | |
| "kl": 0.0176849365234375, | |
| "learning_rate": 2.881438299820394e-06, | |
| "loss": 0.0708, | |
| "reward": 0.8541666865348816, | |
| "reward_std": 0.19756478071212769, | |
| "rewards/accuracy_reward": 0.8541666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 583.9375152587891, | |
| "epoch": 0.432, | |
| "grad_norm": 0.2149602472782135, | |
| "kl": 0.0257720947265625, | |
| "learning_rate": 2.878703008882852e-06, | |
| "loss": 0.0253, | |
| "reward": 0.5833333637565374, | |
| "reward_std": 0.3506578765809536, | |
| "rewards/accuracy_reward": 0.5833333637565374, | |
| "rewards/format_reward": 0.0, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 768.9375305175781, | |
| "epoch": 0.43466666666666665, | |
| "grad_norm": 0.1322740614414215, | |
| "kl": 0.026580810546875, | |
| "learning_rate": 2.8759378529714358e-06, | |
| "loss": -0.0072, | |
| "reward": 0.5625000149011612, | |
| "reward_std": 0.2350771203637123, | |
| "rewards/accuracy_reward": 0.5625000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 866.8333740234375, | |
| "epoch": 0.43733333333333335, | |
| "grad_norm": 0.27988889813423157, | |
| "kl": 0.0343017578125, | |
| "learning_rate": 2.8731428919839684e-06, | |
| "loss": 0.0259, | |
| "reward": 0.4583333395421505, | |
| "reward_std": 0.3680921792984009, | |
| "rewards/accuracy_reward": 0.4583333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 720.4166870117188, | |
| "epoch": 0.44, | |
| "grad_norm": 0.29093390703201294, | |
| "kl": 0.040069580078125, | |
| "learning_rate": 2.8703181864639013e-06, | |
| "loss": -0.0024, | |
| "reward": 0.645833358168602, | |
| "reward_std": 0.2996268570423126, | |
| "rewards/accuracy_reward": 0.645833358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 830.4791870117188, | |
| "epoch": 0.44266666666666665, | |
| "grad_norm": 0.21840965747833252, | |
| "kl": 0.04143524169921875, | |
| "learning_rate": 2.867463797598999e-06, | |
| "loss": 0.1342, | |
| "reward": 0.708333358168602, | |
| "reward_std": 0.415207602083683, | |
| "rewards/accuracy_reward": 0.708333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 838.8125152587891, | |
| "epoch": 0.44533333333333336, | |
| "grad_norm": 0.13068810105323792, | |
| "kl": 0.055908203125, | |
| "learning_rate": 2.8645797872200178e-06, | |
| "loss": -0.0275, | |
| "reward": 0.5833333432674408, | |
| "reward_std": 0.31314554065465927, | |
| "rewards/accuracy_reward": 0.5833333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 860.8125305175781, | |
| "epoch": 0.448, | |
| "grad_norm": 0.232852965593338, | |
| "kl": 0.0428466796875, | |
| "learning_rate": 2.861666217799363e-06, | |
| "loss": 0.0165, | |
| "reward": 0.4375000149011612, | |
| "reward_std": 0.31970490515232086, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 885.5208587646484, | |
| "epoch": 0.45066666666666666, | |
| "grad_norm": 0.34133827686309814, | |
| "kl": 0.032501220703125, | |
| "learning_rate": 2.8587231524497397e-06, | |
| "loss": 0.0144, | |
| "reward": 0.6250000223517418, | |
| "reward_std": 0.3602609820663929, | |
| "rewards/accuracy_reward": 0.6250000223517418, | |
| "rewards/format_reward": 0.0, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 701.1666793823242, | |
| "epoch": 0.4533333333333333, | |
| "grad_norm": 0.10134287923574448, | |
| "kl": 0.0167388916015625, | |
| "learning_rate": 2.855750654922781e-06, | |
| "loss": 0.05, | |
| "reward": 0.833333358168602, | |
| "reward_std": 0.23116152361035347, | |
| "rewards/accuracy_reward": 0.833333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 909.3125457763672, | |
| "epoch": 0.456, | |
| "grad_norm": 0.1822938621044159, | |
| "kl": 0.02960205078125, | |
| "learning_rate": 2.852748789607671e-06, | |
| "loss": 0.1012, | |
| "reward": 0.6041666865348816, | |
| "reward_std": 0.40168893337249756, | |
| "rewards/accuracy_reward": 0.6041666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 1057.0625305175781, | |
| "epoch": 0.45866666666666667, | |
| "grad_norm": 0.15899233520030975, | |
| "kl": 0.057373046875, | |
| "learning_rate": 2.8497176215297474e-06, | |
| "loss": 0.0381, | |
| "reward": 0.3958333432674408, | |
| "reward_std": 0.44616060703992844, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 730.3750305175781, | |
| "epoch": 0.4613333333333333, | |
| "grad_norm": 2.6546502113342285, | |
| "kl": 0.094482421875, | |
| "learning_rate": 2.846657216349094e-06, | |
| "loss": 0.1087, | |
| "reward": 0.4583333432674408, | |
| "reward_std": 0.30354245752096176, | |
| "rewards/accuracy_reward": 0.4583333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 993.375, | |
| "epoch": 0.464, | |
| "grad_norm": 0.15241017937660217, | |
| "kl": 0.0701904296875, | |
| "learning_rate": 2.8435676403591196e-06, | |
| "loss": 0.038, | |
| "reward": 0.45833334885537624, | |
| "reward_std": 0.2861081510782242, | |
| "rewards/accuracy_reward": 0.45833334885537624, | |
| "rewards/format_reward": 0.0, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 867.5208587646484, | |
| "epoch": 0.4666666666666667, | |
| "grad_norm": 0.32228943705558777, | |
| "kl": 0.03955078125, | |
| "learning_rate": 2.8404489604851183e-06, | |
| "loss": 0.0841, | |
| "reward": 0.5625000074505806, | |
| "reward_std": 0.38161084800958633, | |
| "rewards/accuracy_reward": 0.5625000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 930.9167022705078, | |
| "epoch": 0.4693333333333333, | |
| "grad_norm": 0.4391748607158661, | |
| "kl": 0.055755615234375, | |
| "learning_rate": 2.837301244282825e-06, | |
| "loss": 0.072, | |
| "reward": 0.7291666865348816, | |
| "reward_std": 0.334495410323143, | |
| "rewards/accuracy_reward": 0.7291666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 959.8125, | |
| "epoch": 0.472, | |
| "grad_norm": 0.30479004979133606, | |
| "kl": 0.08367919921875, | |
| "learning_rate": 2.8341245599369467e-06, | |
| "loss": 0.1316, | |
| "reward": 0.4375000111758709, | |
| "reward_std": 0.33713920041918755, | |
| "rewards/accuracy_reward": 0.4375000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 784.5208587646484, | |
| "epoch": 0.4746666666666667, | |
| "grad_norm": 0.4404268264770508, | |
| "kl": 0.41534423828125, | |
| "learning_rate": 2.830918976259689e-06, | |
| "loss": 0.0321, | |
| "reward": 0.7291666716337204, | |
| "reward_std": 0.235077116638422, | |
| "rewards/accuracy_reward": 0.7291666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 704.6875076293945, | |
| "epoch": 0.47733333333333333, | |
| "grad_norm": 0.21096909046173096, | |
| "kl": 0.09588623046875, | |
| "learning_rate": 2.827684562689265e-06, | |
| "loss": -0.0393, | |
| "reward": 0.6250000298023224, | |
| "reward_std": 0.20412414148449898, | |
| "rewards/accuracy_reward": 0.6250000298023224, | |
| "rewards/format_reward": 0.0, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 905.7708587646484, | |
| "epoch": 0.48, | |
| "grad_norm": 0.3012356758117676, | |
| "kl": 0.159423828125, | |
| "learning_rate": 2.8244213892883906e-06, | |
| "loss": 0.1138, | |
| "reward": 0.45833333395421505, | |
| "reward_std": 0.32362050563097, | |
| "rewards/accuracy_reward": 0.45833333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 786.2292022705078, | |
| "epoch": 0.4826666666666667, | |
| "grad_norm": 0.3507545292377472, | |
| "kl": 0.203125, | |
| "learning_rate": 2.821129526742766e-06, | |
| "loss": 0.1097, | |
| "reward": 0.4583333432674408, | |
| "reward_std": 0.32097672671079636, | |
| "rewards/accuracy_reward": 0.4583333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 819.5000305175781, | |
| "epoch": 0.48533333333333334, | |
| "grad_norm": 0.15901023149490356, | |
| "kl": 0.1199951171875, | |
| "learning_rate": 2.8178090463595464e-06, | |
| "loss": 0.0023, | |
| "reward": 0.4166666716337204, | |
| "reward_std": 0.20148037374019623, | |
| "rewards/accuracy_reward": 0.4166666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 763.6041870117188, | |
| "epoch": 0.488, | |
| "grad_norm": 0.9456762671470642, | |
| "kl": 0.3079833984375, | |
| "learning_rate": 2.814460020065795e-06, | |
| "loss": -0.0257, | |
| "reward": 0.5416666865348816, | |
| "reward_std": 0.26603008806705475, | |
| "rewards/accuracy_reward": 0.5416666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 770.3125152587891, | |
| "epoch": 0.49066666666666664, | |
| "grad_norm": 0.6205459833145142, | |
| "kl": 0.220458984375, | |
| "learning_rate": 2.8110825204069292e-06, | |
| "loss": 0.0208, | |
| "reward": 0.5000000223517418, | |
| "reward_std": 0.4230387955904007, | |
| "rewards/accuracy_reward": 0.5000000223517418, | |
| "rewards/format_reward": 0.0, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 920.4166870117188, | |
| "epoch": 0.49333333333333335, | |
| "grad_norm": 0.23448574542999268, | |
| "kl": 0.085784912109375, | |
| "learning_rate": 2.8076766205451433e-06, | |
| "loss": 0.1141, | |
| "reward": 0.3750000074505806, | |
| "reward_std": 0.3680921792984009, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 823.4375305175781, | |
| "epoch": 0.496, | |
| "grad_norm": 0.1786121129989624, | |
| "kl": 0.04027557373046875, | |
| "learning_rate": 2.8042423942578284e-06, | |
| "loss": 0.0038, | |
| "reward": 0.6041666828095913, | |
| "reward_std": 0.28219256177544594, | |
| "rewards/accuracy_reward": 0.6041666828095913, | |
| "rewards/format_reward": 0.0, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 816.6041870117188, | |
| "epoch": 0.49866666666666665, | |
| "grad_norm": 0.18998931348323822, | |
| "kl": 0.22528076171875, | |
| "learning_rate": 2.800779915935972e-06, | |
| "loss": 0.0081, | |
| "reward": 0.5625000204890966, | |
| "reward_std": 0.2350771240890026, | |
| "rewards/accuracy_reward": 0.5625000204890966, | |
| "rewards/format_reward": 0.0, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 1057.0833587646484, | |
| "epoch": 0.5013333333333333, | |
| "grad_norm": 0.6255596280097961, | |
| "kl": 0.224365234375, | |
| "learning_rate": 2.7972892605825464e-06, | |
| "loss": 0.0974, | |
| "reward": 0.3750000149011612, | |
| "reward_std": 0.24859579652547836, | |
| "rewards/accuracy_reward": 0.3750000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 768.5416870117188, | |
| "epoch": 0.504, | |
| "grad_norm": 0.36918389797210693, | |
| "kl": 0.1812286376953125, | |
| "learning_rate": 2.7937705038108863e-06, | |
| "loss": -0.0044, | |
| "reward": 0.3958333469927311, | |
| "reward_std": 0.36417658627033234, | |
| "rewards/accuracy_reward": 0.3958333469927311, | |
| "rewards/format_reward": 0.0, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 756.9375152587891, | |
| "epoch": 0.5066666666666667, | |
| "grad_norm": 1.1205483675003052, | |
| "kl": 0.2040252685546875, | |
| "learning_rate": 2.7902237218430485e-06, | |
| "loss": 0.2227, | |
| "reward": 0.6875000298023224, | |
| "reward_std": 0.36417656391859055, | |
| "rewards/accuracy_reward": 0.6875000298023224, | |
| "rewards/format_reward": 0.0, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 843.4166870117188, | |
| "epoch": 0.5093333333333333, | |
| "grad_norm": 0.6259863972663879, | |
| "kl": 0.201416015625, | |
| "learning_rate": 2.7866489915081606e-06, | |
| "loss": 0.1422, | |
| "reward": 0.3958333544433117, | |
| "reward_std": 0.38161084800958633, | |
| "rewards/accuracy_reward": 0.3958333544433117, | |
| "rewards/format_reward": 0.0, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 1043.2292175292969, | |
| "epoch": 0.512, | |
| "grad_norm": 3.836799144744873, | |
| "kl": 0.7412109375, | |
| "learning_rate": 2.78304639024076e-06, | |
| "loss": 0.2273, | |
| "reward": 0.4583333358168602, | |
| "reward_std": 0.2686738818883896, | |
| "rewards/accuracy_reward": 0.4583333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 1042.2291870117188, | |
| "epoch": 0.5146666666666667, | |
| "grad_norm": 3.3550195693969727, | |
| "kl": 0.6171875, | |
| "learning_rate": 2.7794159960791125e-06, | |
| "loss": 0.1258, | |
| "reward": 0.5625000149011612, | |
| "reward_std": 0.1705274023115635, | |
| "rewards/accuracy_reward": 0.5625000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 1017.2500305175781, | |
| "epoch": 0.5173333333333333, | |
| "grad_norm": 12.088400840759277, | |
| "kl": 1.6103515625, | |
| "learning_rate": 2.775757887663525e-06, | |
| "loss": 0.2264, | |
| "reward": 0.45833333395421505, | |
| "reward_std": 0.3977733328938484, | |
| "rewards/accuracy_reward": 0.45833333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 995.3542175292969, | |
| "epoch": 0.52, | |
| "grad_norm": 4.430953502655029, | |
| "kl": 0.4501953125, | |
| "learning_rate": 2.772072144234639e-06, | |
| "loss": 0.0376, | |
| "reward": 0.3750000111758709, | |
| "reward_std": 0.3776952587068081, | |
| "rewards/accuracy_reward": 0.3750000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 707.4375076293945, | |
| "epoch": 0.5226666666666666, | |
| "grad_norm": 2.779906749725342, | |
| "kl": 0.2867431640625, | |
| "learning_rate": 2.7683588456317177e-06, | |
| "loss": 0.0161, | |
| "reward": 0.6250000223517418, | |
| "reward_std": 0.38552645593881607, | |
| "rewards/accuracy_reward": 0.6250000223517418, | |
| "rewards/format_reward": 0.0, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 682.6250152587891, | |
| "epoch": 0.5253333333333333, | |
| "grad_norm": 1.7151806354522705, | |
| "kl": 0.238525390625, | |
| "learning_rate": 2.764618072290913e-06, | |
| "loss": -0.0224, | |
| "reward": 0.37500000558793545, | |
| "reward_std": 0.32097671553492546, | |
| "rewards/accuracy_reward": 0.37500000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 908.3541717529297, | |
| "epoch": 0.528, | |
| "grad_norm": 5.420147895812988, | |
| "kl": 0.329833984375, | |
| "learning_rate": 2.7608499052435266e-06, | |
| "loss": 0.0899, | |
| "reward": 0.583333358168602, | |
| "reward_std": 0.4500761702656746, | |
| "rewards/accuracy_reward": 0.583333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 823.0208587646484, | |
| "epoch": 0.5306666666666666, | |
| "grad_norm": 2.1091978549957275, | |
| "kl": 0.2232666015625, | |
| "learning_rate": 2.757054426114251e-06, | |
| "loss": 0.0821, | |
| "reward": 0.666666679084301, | |
| "reward_std": 0.22155843675136566, | |
| "rewards/accuracy_reward": 0.666666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 670.4583587646484, | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 3.2582030296325684, | |
| "kl": 0.3426513671875, | |
| "learning_rate": 2.753231717119405e-06, | |
| "loss": -0.0217, | |
| "reward": 0.6250000149011612, | |
| "reward_std": 0.2861081585288048, | |
| "rewards/accuracy_reward": 0.6250000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 620.9583435058594, | |
| "epoch": 0.536, | |
| "grad_norm": 2.6548030376434326, | |
| "kl": 0.317626953125, | |
| "learning_rate": 2.749381861065149e-06, | |
| "loss": 0.0096, | |
| "reward": 0.4375000111758709, | |
| "reward_std": 0.28219256177544594, | |
| "rewards/accuracy_reward": 0.4375000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 744.6041870117188, | |
| "epoch": 0.5386666666666666, | |
| "grad_norm": 1.7339693307876587, | |
| "kl": 0.538330078125, | |
| "learning_rate": 2.7455049413456964e-06, | |
| "loss": 0.0956, | |
| "reward": 0.47916669212281704, | |
| "reward_std": 0.28219255805015564, | |
| "rewards/accuracy_reward": 0.47916669212281704, | |
| "rewards/format_reward": 0.0, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 586.5208511352539, | |
| "epoch": 0.5413333333333333, | |
| "grad_norm": 4.914543151855469, | |
| "kl": 0.373291015625, | |
| "learning_rate": 2.741601041941501e-06, | |
| "loss": 0.0462, | |
| "reward": 0.645833358168602, | |
| "reward_std": 0.33713919669389725, | |
| "rewards/accuracy_reward": 0.645833358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 734.645866394043, | |
| "epoch": 0.544, | |
| "grad_norm": 4.155093193054199, | |
| "kl": 0.401123046875, | |
| "learning_rate": 2.7376702474174426e-06, | |
| "loss": -0.0151, | |
| "reward": 0.604166679084301, | |
| "reward_std": 0.2350771352648735, | |
| "rewards/accuracy_reward": 0.604166679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 737.4375305175781, | |
| "epoch": 0.5466666666666666, | |
| "grad_norm": 3.2076289653778076, | |
| "kl": 0.97149658203125, | |
| "learning_rate": 2.7337126429209934e-06, | |
| "loss": 0.0455, | |
| "reward": 0.5625000149011612, | |
| "reward_std": 0.33713920041918755, | |
| "rewards/accuracy_reward": 0.5625000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 620.2500152587891, | |
| "epoch": 0.5493333333333333, | |
| "grad_norm": 28.125852584838867, | |
| "kl": 4.544921875, | |
| "learning_rate": 2.729728314180373e-06, | |
| "loss": 0.0901, | |
| "reward": 0.479166679084301, | |
| "reward_std": 0.42872631549835205, | |
| "rewards/accuracy_reward": 0.479166679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 894.8333587646484, | |
| "epoch": 0.552, | |
| "grad_norm": 43.36728286743164, | |
| "kl": 1.431640625, | |
| "learning_rate": 2.725717347502693e-06, | |
| "loss": 0.14, | |
| "reward": 0.3958333432674408, | |
| "reward_std": 0.36417659372091293, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 730.4583587646484, | |
| "epoch": 0.5546666666666666, | |
| "grad_norm": 15.029769897460938, | |
| "kl": 1.0419921875, | |
| "learning_rate": 2.7216798297720855e-06, | |
| "loss": 0.0142, | |
| "reward": 0.2916666753590107, | |
| "reward_std": 0.2861081659793854, | |
| "rewards/accuracy_reward": 0.2916666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 599.1666870117188, | |
| "epoch": 0.5573333333333333, | |
| "grad_norm": 2.065986394882202, | |
| "kl": 0.42669677734375, | |
| "learning_rate": 2.7176158484478224e-06, | |
| "loss": 0.0308, | |
| "reward": 0.5416666716337204, | |
| "reward_std": 0.3776952847838402, | |
| "rewards/accuracy_reward": 0.5416666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 697.2083435058594, | |
| "epoch": 0.56, | |
| "grad_norm": 0.4965348541736603, | |
| "kl": 0.12176513671875, | |
| "learning_rate": 2.713525491562421e-06, | |
| "loss": 0.0151, | |
| "reward": 0.5416666772216558, | |
| "reward_std": 0.23899271339178085, | |
| "rewards/accuracy_reward": 0.5416666772216558, | |
| "rewards/format_reward": 0.0, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 594.5208587646484, | |
| "epoch": 0.5626666666666666, | |
| "grad_norm": 8.045109748840332, | |
| "kl": 0.330322265625, | |
| "learning_rate": 2.709408847719737e-06, | |
| "loss": 0.0581, | |
| "reward": 0.5625000111758709, | |
| "reward_std": 0.37377967685461044, | |
| "rewards/accuracy_reward": 0.5625000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 852.3333435058594, | |
| "epoch": 0.5653333333333334, | |
| "grad_norm": 2.485161066055298, | |
| "kl": 0.36962890625, | |
| "learning_rate": 2.705266006093043e-06, | |
| "loss": 0.0566, | |
| "reward": 0.645833358168602, | |
| "reward_std": 0.37377968057990074, | |
| "rewards/accuracy_reward": 0.645833358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 788.0833435058594, | |
| "epoch": 0.568, | |
| "grad_norm": 2.3095505237579346, | |
| "kl": 0.4625244140625, | |
| "learning_rate": 2.7010970564231e-06, | |
| "loss": 0.0677, | |
| "reward": 0.6041666828095913, | |
| "reward_std": 0.27258946746587753, | |
| "rewards/accuracy_reward": 0.6041666828095913, | |
| "rewards/format_reward": 0.0, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 673.5833587646484, | |
| "epoch": 0.5706666666666667, | |
| "grad_norm": 40.70173645019531, | |
| "kl": 5.20703125, | |
| "learning_rate": 2.696902089016213e-06, | |
| "loss": 0.3097, | |
| "reward": 0.4791666753590107, | |
| "reward_std": 0.40952012687921524, | |
| "rewards/accuracy_reward": 0.4791666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 735.2708511352539, | |
| "epoch": 0.5733333333333334, | |
| "grad_norm": 5.52685022354126, | |
| "kl": 1.55078125, | |
| "learning_rate": 2.6926811947422717e-06, | |
| "loss": -0.021, | |
| "reward": 0.5208333358168602, | |
| "reward_std": 0.3170611336827278, | |
| "rewards/accuracy_reward": 0.5208333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 659.0000228881836, | |
| "epoch": 0.576, | |
| "grad_norm": 7.676098346710205, | |
| "kl": 0.54638671875, | |
| "learning_rate": 2.688434465032786e-06, | |
| "loss": 0.132, | |
| "reward": 0.5833333432674408, | |
| "reward_std": 0.4326419085264206, | |
| "rewards/accuracy_reward": 0.5833333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 651.1666717529297, | |
| "epoch": 0.5786666666666667, | |
| "grad_norm": 2.9158401489257812, | |
| "kl": 0.712890625, | |
| "learning_rate": 2.6841619918789038e-06, | |
| "loss": 0.0471, | |
| "reward": 0.6250000149011612, | |
| "reward_std": 0.26603010296821594, | |
| "rewards/accuracy_reward": 0.6250000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 708.9166870117188, | |
| "epoch": 0.5813333333333334, | |
| "grad_norm": 7.161975383758545, | |
| "kl": 1.939453125, | |
| "learning_rate": 2.679863867829417e-06, | |
| "loss": 0.199, | |
| "reward": 0.3750000149011612, | |
| "reward_std": 0.26603008806705475, | |
| "rewards/accuracy_reward": 0.3750000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 791.8958435058594, | |
| "epoch": 0.584, | |
| "grad_norm": 3.1737961769104004, | |
| "kl": 0.8603515625, | |
| "learning_rate": 2.67554018598876e-06, | |
| "loss": 0.0281, | |
| "reward": 0.479166679084301, | |
| "reward_std": 0.2350771278142929, | |
| "rewards/accuracy_reward": 0.479166679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 940.2708435058594, | |
| "epoch": 0.5866666666666667, | |
| "grad_norm": 4.666038990020752, | |
| "kl": 0.8388671875, | |
| "learning_rate": 2.671191040014989e-06, | |
| "loss": 0.0128, | |
| "reward": 0.541666679084301, | |
| "reward_std": 0.3506578803062439, | |
| "rewards/accuracy_reward": 0.541666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 721.3958435058594, | |
| "epoch": 0.5893333333333334, | |
| "grad_norm": 0.6132449507713318, | |
| "kl": 0.56298828125, | |
| "learning_rate": 2.666816524117757e-06, | |
| "loss": -0.0265, | |
| "reward": 0.2708333395421505, | |
| "reward_std": 0.2446802221238613, | |
| "rewards/accuracy_reward": 0.2708333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 859.0416870117188, | |
| "epoch": 0.592, | |
| "grad_norm": 4.1063385009765625, | |
| "kl": 0.29248046875, | |
| "learning_rate": 2.6624167330562694e-06, | |
| "loss": -0.0277, | |
| "reward": 0.4791666828095913, | |
| "reward_std": 0.34674228727817535, | |
| "rewards/accuracy_reward": 0.4791666828095913, | |
| "rewards/format_reward": 0.0, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 823.2291870117188, | |
| "epoch": 0.5946666666666667, | |
| "grad_norm": 1.2631244659423828, | |
| "kl": 0.5458984375, | |
| "learning_rate": 2.657991762137235e-06, | |
| "loss": 0.0232, | |
| "reward": 0.2500000037252903, | |
| "reward_std": 0.2957112602889538, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 575.0000152587891, | |
| "epoch": 0.5973333333333334, | |
| "grad_norm": 5.164083957672119, | |
| "kl": 3.855224609375, | |
| "learning_rate": 2.653541707212799e-06, | |
| "loss": -0.0518, | |
| "reward": 0.3333333395421505, | |
| "reward_std": 0.3332235999405384, | |
| "rewards/accuracy_reward": 0.3333333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 526.0000228881836, | |
| "epoch": 0.6, | |
| "grad_norm": 4.015169143676758, | |
| "kl": 1.4453125, | |
| "learning_rate": 2.649066664678467e-06, | |
| "loss": -0.0142, | |
| "reward": 0.6250000223517418, | |
| "reward_std": 0.4230388030409813, | |
| "rewards/accuracy_reward": 0.6250000223517418, | |
| "rewards/format_reward": 0.0, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 660.2083587646484, | |
| "epoch": 0.6026666666666667, | |
| "grad_norm": 2.765709400177002, | |
| "kl": 2.708984375, | |
| "learning_rate": 2.6445667314710174e-06, | |
| "loss": 0.0736, | |
| "reward": 0.4375000149011612, | |
| "reward_std": 0.21764283254742622, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 628.0000076293945, | |
| "epoch": 0.6053333333333333, | |
| "grad_norm": 1.9771387577056885, | |
| "kl": 0.358154296875, | |
| "learning_rate": 2.6400420050664027e-06, | |
| "loss": 0.0267, | |
| "reward": 0.5833333432674408, | |
| "reward_std": 0.3506578728556633, | |
| "rewards/accuracy_reward": 0.5833333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 596.3750228881836, | |
| "epoch": 0.608, | |
| "grad_norm": 9.701559066772461, | |
| "kl": 2.94921875, | |
| "learning_rate": 2.6354925834776346e-06, | |
| "loss": 0.1108, | |
| "reward": 0.6666666865348816, | |
| "reward_std": 0.40296073257923126, | |
| "rewards/accuracy_reward": 0.6666666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 695.5208587646484, | |
| "epoch": 0.6106666666666667, | |
| "grad_norm": 10.11015796661377, | |
| "kl": 2.19921875, | |
| "learning_rate": 2.6309185652526653e-06, | |
| "loss": 0.1138, | |
| "reward": 0.29166667349636555, | |
| "reward_std": 0.31314554437994957, | |
| "rewards/accuracy_reward": 0.29166667349636555, | |
| "rewards/format_reward": 0.0, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 897.9583435058594, | |
| "epoch": 0.6133333333333333, | |
| "grad_norm": 4.603687763214111, | |
| "kl": 2.24267578125, | |
| "learning_rate": 2.626320049472249e-06, | |
| "loss": 0.0939, | |
| "reward": 0.3958333469927311, | |
| "reward_std": 0.3266642242670059, | |
| "rewards/accuracy_reward": 0.3958333469927311, | |
| "rewards/format_reward": 0.0, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 647.6041870117188, | |
| "epoch": 0.616, | |
| "grad_norm": 3.770287036895752, | |
| "kl": 1.5478515625, | |
| "learning_rate": 2.621697135747798e-06, | |
| "loss": 0.0139, | |
| "reward": 0.4791666865348816, | |
| "reward_std": 0.3720077723264694, | |
| "rewards/accuracy_reward": 0.4791666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 513.5833435058594, | |
| "epoch": 0.6186666666666667, | |
| "grad_norm": 2.456714153289795, | |
| "kl": 2.1630859375, | |
| "learning_rate": 2.6170499242192243e-06, | |
| "loss": 0.0842, | |
| "reward": 0.4375000149011612, | |
| "reward_std": 0.2446802221238613, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 921.4583740234375, | |
| "epoch": 0.6213333333333333, | |
| "grad_norm": 3.1872363090515137, | |
| "kl": 2.4111328125, | |
| "learning_rate": 2.6123785155527693e-06, | |
| "loss": 0.0178, | |
| "reward": 0.5000000074505806, | |
| "reward_std": 0.4797573611140251, | |
| "rewards/accuracy_reward": 0.5000000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 564.6458511352539, | |
| "epoch": 0.624, | |
| "grad_norm": 2.388471841812134, | |
| "kl": 1.87744140625, | |
| "learning_rate": 2.607683010938826e-06, | |
| "loss": -0.0138, | |
| "reward": 0.37500001303851604, | |
| "reward_std": 0.2861081659793854, | |
| "rewards/accuracy_reward": 0.37500001303851604, | |
| "rewards/format_reward": 0.0, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 594.4166870117188, | |
| "epoch": 0.6266666666666667, | |
| "grad_norm": 3.252562999725342, | |
| "kl": 0.6201171875, | |
| "learning_rate": 2.6029635120897432e-06, | |
| "loss": 0.008, | |
| "reward": 0.3333333432674408, | |
| "reward_std": 0.3680921792984009, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 556.0208435058594, | |
| "epoch": 0.6293333333333333, | |
| "grad_norm": 4.7144317626953125, | |
| "kl": 0.822265625, | |
| "learning_rate": 2.5982201212376253e-06, | |
| "loss": 0.1413, | |
| "reward": 0.3958333507180214, | |
| "reward_std": 0.35457348451018333, | |
| "rewards/accuracy_reward": 0.3958333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 661.8541870117188, | |
| "epoch": 0.632, | |
| "grad_norm": 2.591197967529297, | |
| "kl": 0.4580078125, | |
| "learning_rate": 2.5934529411321173e-06, | |
| "loss": 0.1095, | |
| "reward": 0.5000000149011612, | |
| "reward_std": 0.37592336907982826, | |
| "rewards/accuracy_reward": 0.5000000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 802.5625305175781, | |
| "epoch": 0.6346666666666667, | |
| "grad_norm": 2.123358964920044, | |
| "kl": 1.932373046875, | |
| "learning_rate": 2.588662075038178e-06, | |
| "loss": 0.2456, | |
| "reward": 0.3125000074505806, | |
| "reward_std": 0.2525114193558693, | |
| "rewards/accuracy_reward": 0.3125000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 822.4583740234375, | |
| "epoch": 0.6373333333333333, | |
| "grad_norm": 3.56559157371521, | |
| "kl": 2.3046875, | |
| "learning_rate": 2.583847626733842e-06, | |
| "loss": 0.194, | |
| "reward": 0.458333358168602, | |
| "reward_std": 0.3506578654050827, | |
| "rewards/accuracy_reward": 0.458333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 815.1458435058594, | |
| "epoch": 0.64, | |
| "grad_norm": 1.742954134941101, | |
| "kl": 1.6240234375, | |
| "learning_rate": 2.5790097005079765e-06, | |
| "loss": 0.2733, | |
| "reward": 0.3333333358168602, | |
| "reward_std": 0.4056045189499855, | |
| "rewards/accuracy_reward": 0.3333333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 674.9375457763672, | |
| "epoch": 0.6426666666666667, | |
| "grad_norm": 1.7041113376617432, | |
| "kl": 0.5087890625, | |
| "learning_rate": 2.574148401158017e-06, | |
| "loss": 0.0284, | |
| "reward": 0.416666679084301, | |
| "reward_std": 0.24859580025076866, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 524.5833511352539, | |
| "epoch": 0.6453333333333333, | |
| "grad_norm": 2.1262567043304443, | |
| "kl": 0.33056640625, | |
| "learning_rate": 2.5692638339877007e-06, | |
| "loss": 0.0999, | |
| "reward": 0.7291666865348816, | |
| "reward_std": 0.37377967685461044, | |
| "rewards/accuracy_reward": 0.7291666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 697.5, | |
| "epoch": 0.648, | |
| "grad_norm": 0.536224365234375, | |
| "kl": 1.41162109375, | |
| "learning_rate": 2.5643561048047816e-06, | |
| "loss": 0.0552, | |
| "reward": 0.39583333395421505, | |
| "reward_std": 0.235077116638422, | |
| "rewards/accuracy_reward": 0.39583333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 722.6250305175781, | |
| "epoch": 0.6506666666666666, | |
| "grad_norm": 0.7712324261665344, | |
| "kl": 1.03076171875, | |
| "learning_rate": 2.559425319918743e-06, | |
| "loss": 0.0207, | |
| "reward": 0.5000000111758709, | |
| "reward_std": 0.19364918768405914, | |
| "rewards/accuracy_reward": 0.5000000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 853.6875152587891, | |
| "epoch": 0.6533333333333333, | |
| "grad_norm": 115.63752746582031, | |
| "kl": 3.2158203125, | |
| "learning_rate": 2.5544715861384928e-06, | |
| "loss": 0.3086, | |
| "reward": 0.0833333358168602, | |
| "reward_std": 0.16661179810762405, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 890.3958587646484, | |
| "epoch": 0.656, | |
| "grad_norm": 1.7879916429519653, | |
| "kl": 1.380859375, | |
| "learning_rate": 2.549495010770048e-06, | |
| "loss": 0.0694, | |
| "reward": 0.14583333395421505, | |
| "reward_std": 0.18796167895197868, | |
| "rewards/accuracy_reward": 0.14583333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 719.2916870117188, | |
| "epoch": 0.6586666666666666, | |
| "grad_norm": 0.9578667283058167, | |
| "kl": 1.2451171875, | |
| "learning_rate": 2.5444957016142144e-06, | |
| "loss": 0.0935, | |
| "reward": 0.18750000186264515, | |
| "reward_std": 0.25515517219901085, | |
| "rewards/accuracy_reward": 0.18750000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 832.8958587646484, | |
| "epoch": 0.6613333333333333, | |
| "grad_norm": 0.5120651125907898, | |
| "kl": 0.66162109375, | |
| "learning_rate": 2.5394737669642457e-06, | |
| "loss": -0.0347, | |
| "reward": 0.2500000074505806, | |
| "reward_std": 0.3131455257534981, | |
| "rewards/accuracy_reward": 0.2500000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 647.9791870117188, | |
| "epoch": 0.664, | |
| "grad_norm": 0.7588649392127991, | |
| "kl": 0.511962890625, | |
| "learning_rate": 2.5344293156035046e-06, | |
| "loss": -0.0722, | |
| "reward": 0.2916666753590107, | |
| "reward_std": 0.3872983753681183, | |
| "rewards/accuracy_reward": 0.2916666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 553.7708435058594, | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 2.4626846313476562, | |
| "kl": 0.6552734375, | |
| "learning_rate": 2.529362456803101e-06, | |
| "loss": -0.0393, | |
| "reward": 0.4583333432674408, | |
| "reward_std": 0.3506578877568245, | |
| "rewards/accuracy_reward": 0.4583333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 539.1666793823242, | |
| "epoch": 0.6693333333333333, | |
| "grad_norm": 1.022650957107544, | |
| "kl": 0.418212890625, | |
| "learning_rate": 2.5242733003195252e-06, | |
| "loss": 0.0062, | |
| "reward": 0.520833358168602, | |
| "reward_std": 0.1801304928958416, | |
| "rewards/accuracy_reward": 0.520833358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 657.2083587646484, | |
| "epoch": 0.672, | |
| "grad_norm": 1.0106619596481323, | |
| "kl": 0.384521484375, | |
| "learning_rate": 2.519161956392275e-06, | |
| "loss": 0.086, | |
| "reward": 0.416666679084301, | |
| "reward_std": 0.32097672671079636, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 598.6666717529297, | |
| "epoch": 0.6746666666666666, | |
| "grad_norm": 0.43861910700798035, | |
| "kl": 0.755126953125, | |
| "learning_rate": 2.514028535741463e-06, | |
| "loss": 0.0453, | |
| "reward": 0.4583333432674408, | |
| "reward_std": 0.30354243889451027, | |
| "rewards/accuracy_reward": 0.4583333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 729.8333587646484, | |
| "epoch": 0.6773333333333333, | |
| "grad_norm": 0.8300098776817322, | |
| "kl": 1.365478515625, | |
| "learning_rate": 2.5088731495654205e-06, | |
| "loss": -0.0343, | |
| "reward": 0.2916666716337204, | |
| "reward_std": 0.30354245007038116, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 585.8750152587891, | |
| "epoch": 0.68, | |
| "grad_norm": 3.1680819988250732, | |
| "kl": 1.72802734375, | |
| "learning_rate": 2.5036959095382875e-06, | |
| "loss": 0.0543, | |
| "reward": 0.2291666679084301, | |
| "reward_std": 0.21764283627271652, | |
| "rewards/accuracy_reward": 0.2291666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 701.7083587646484, | |
| "epoch": 0.6826666666666666, | |
| "grad_norm": 3.7772672176361084, | |
| "kl": 1.54052734375, | |
| "learning_rate": 2.4984969278075954e-06, | |
| "loss": 0.0048, | |
| "reward": 0.22916666977107525, | |
| "reward_std": 0.30745804682374, | |
| "rewards/accuracy_reward": 0.22916666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 692.0000076293945, | |
| "epoch": 0.6853333333333333, | |
| "grad_norm": 11.947521209716797, | |
| "kl": 3.556640625, | |
| "learning_rate": 2.4932763169918353e-06, | |
| "loss": 0.0981, | |
| "reward": 0.18750000186264515, | |
| "reward_std": 0.2525114119052887, | |
| "rewards/accuracy_reward": 0.18750000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 586.1458511352539, | |
| "epoch": 0.688, | |
| "grad_norm": 1.9000444412231445, | |
| "kl": 1.68359375, | |
| "learning_rate": 2.4880341901780208e-06, | |
| "loss": 0.0541, | |
| "reward": 0.1875000111758709, | |
| "reward_std": 0.23507710546255112, | |
| "rewards/accuracy_reward": 0.1875000111758709, | |
| "rewards/format_reward": 0.0, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 483.18750762939453, | |
| "epoch": 0.6906666666666667, | |
| "grad_norm": 1.4334735870361328, | |
| "kl": 2.0693359375, | |
| "learning_rate": 2.4827706609192375e-06, | |
| "loss": -0.0813, | |
| "reward": 0.2500000037252903, | |
| "reward_std": 0.31314555555582047, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 479.43751525878906, | |
| "epoch": 0.6933333333333334, | |
| "grad_norm": 1.8273588418960571, | |
| "kl": 1.2958984375, | |
| "learning_rate": 2.477485843232183e-06, | |
| "loss": 0.031, | |
| "reward": 0.06250000186264515, | |
| "reward_std": 0.1530931107699871, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 468.50001525878906, | |
| "epoch": 0.696, | |
| "grad_norm": 10.838756561279297, | |
| "kl": 0.923828125, | |
| "learning_rate": 2.4721798515946964e-06, | |
| "loss": -0.005, | |
| "reward": 0.2083333358168602, | |
| "reward_std": 0.3061862140893936, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 668.9375152587891, | |
| "epoch": 0.6986666666666667, | |
| "grad_norm": 1.042129635810852, | |
| "kl": 0.728515625, | |
| "learning_rate": 2.4668528009432804e-06, | |
| "loss": -0.013, | |
| "reward": 0.1041666679084301, | |
| "reward_std": 0.13301505148410797, | |
| "rewards/accuracy_reward": 0.1041666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 398.3958511352539, | |
| "epoch": 0.7013333333333334, | |
| "grad_norm": 3.4529807567596436, | |
| "kl": 0.791015625, | |
| "learning_rate": 2.4615048066706103e-06, | |
| "loss": -0.0484, | |
| "reward": 0.06250000186264515, | |
| "reward_std": 0.11558076366782188, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 423.5208435058594, | |
| "epoch": 0.704, | |
| "grad_norm": 0.7864275574684143, | |
| "kl": 0.6142578125, | |
| "learning_rate": 2.456135984623035e-06, | |
| "loss": -0.0284, | |
| "reward": 0.0416666679084301, | |
| "reward_std": 0.10206206887960434, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 473.39584732055664, | |
| "epoch": 0.7066666666666667, | |
| "grad_norm": 3.637108564376831, | |
| "kl": 0.5068359375, | |
| "learning_rate": 2.4507464510980654e-06, | |
| "loss": -0.04, | |
| "reward": 0.2708333469927311, | |
| "reward_std": 0.30922994762659073, | |
| "rewards/accuracy_reward": 0.2708333469927311, | |
| "rewards/format_reward": 0.0, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 455.2291793823242, | |
| "epoch": 0.7093333333333334, | |
| "grad_norm": 1.2332038879394531, | |
| "kl": 0.580078125, | |
| "learning_rate": 2.44533632284186e-06, | |
| "loss": 0.0084, | |
| "reward": 0.14583333767950535, | |
| "reward_std": 0.2350771315395832, | |
| "rewards/accuracy_reward": 0.14583333767950535, | |
| "rewards/format_reward": 0.0, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 636.5625152587891, | |
| "epoch": 0.712, | |
| "grad_norm": 4.876010417938232, | |
| "kl": 0.791015625, | |
| "learning_rate": 2.439905717046691e-06, | |
| "loss": -0.0555, | |
| "reward": 0.16666666977107525, | |
| "reward_std": 0.18404608592391014, | |
| "rewards/accuracy_reward": 0.16666666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 663.1041717529297, | |
| "epoch": 0.7146666666666667, | |
| "grad_norm": 1.758858561515808, | |
| "kl": 0.2115478515625, | |
| "learning_rate": 2.434454751348408e-06, | |
| "loss": 0.0277, | |
| "reward": 0.5625000149011612, | |
| "reward_std": 0.4932760149240494, | |
| "rewards/accuracy_reward": 0.5625000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 617.1042022705078, | |
| "epoch": 0.7173333333333334, | |
| "grad_norm": 2.5681865215301514, | |
| "kl": 0.284912109375, | |
| "learning_rate": 2.4289835438238904e-06, | |
| "loss": -0.0989, | |
| "reward": 0.3958333469927311, | |
| "reward_std": 0.4758417531847954, | |
| "rewards/accuracy_reward": 0.3958333469927311, | |
| "rewards/format_reward": 0.0, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 676.3541870117188, | |
| "epoch": 0.72, | |
| "grad_norm": 0.852873682975769, | |
| "kl": 0.2982177734375, | |
| "learning_rate": 2.4234922129884873e-06, | |
| "loss": -0.0539, | |
| "reward": 0.39583334885537624, | |
| "reward_std": 0.40168890357017517, | |
| "rewards/accuracy_reward": 0.39583334885537624, | |
| "rewards/format_reward": 0.0, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 613.6041870117188, | |
| "epoch": 0.7226666666666667, | |
| "grad_norm": 4.721111297607422, | |
| "kl": 0.610107421875, | |
| "learning_rate": 2.417980877793454e-06, | |
| "loss": 0.0059, | |
| "reward": 0.2291666753590107, | |
| "reward_std": 0.24468021839857101, | |
| "rewards/accuracy_reward": 0.2291666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 618.0000152587891, | |
| "epoch": 0.7253333333333334, | |
| "grad_norm": 6.144564151763916, | |
| "kl": 1.78515625, | |
| "learning_rate": 2.4124496576233714e-06, | |
| "loss": 0.0333, | |
| "reward": 0.1666666679084301, | |
| "reward_std": 0.23899272084236145, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 540.5833358764648, | |
| "epoch": 0.728, | |
| "grad_norm": 16.126615524291992, | |
| "kl": 0.8798828125, | |
| "learning_rate": 2.4068986722935626e-06, | |
| "loss": -0.0133, | |
| "reward": 0.1875000074505806, | |
| "reward_std": 0.24468021839857101, | |
| "rewards/accuracy_reward": 0.1875000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 453.3958435058594, | |
| "epoch": 0.7306666666666667, | |
| "grad_norm": 235.8284454345703, | |
| "kl": 1.6103515625, | |
| "learning_rate": 2.4013280420474953e-06, | |
| "loss": 0.0641, | |
| "reward": 0.06250000186264515, | |
| "reward_std": 0.11558076739311218, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 458.7291793823242, | |
| "epoch": 0.7333333333333333, | |
| "grad_norm": 5939.08154296875, | |
| "kl": 16.091796875, | |
| "learning_rate": 2.3957378875541795e-06, | |
| "loss": 0.8365, | |
| "reward": 0.0625, | |
| "reward_std": 0.06846532225608826, | |
| "rewards/accuracy_reward": 0.0625, | |
| "rewards/format_reward": 0.0, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 541.1666946411133, | |
| "epoch": 0.736, | |
| "grad_norm": 7498.9140625, | |
| "kl": 78.00390625, | |
| "learning_rate": 2.3901283299055523e-06, | |
| "loss": 2.987, | |
| "reward": 0.12500000558793545, | |
| "reward_std": 0.10206207260489464, | |
| "rewards/accuracy_reward": 0.12500000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 546.1666946411133, | |
| "epoch": 0.7386666666666667, | |
| "grad_norm": 80.91847229003906, | |
| "kl": 7.0234375, | |
| "learning_rate": 2.3844994906138548e-06, | |
| "loss": 0.1409, | |
| "reward": 0.14583333395421505, | |
| "reward_std": 0.18796168267726898, | |
| "rewards/accuracy_reward": 0.14583333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 548.6666870117188, | |
| "epoch": 0.7413333333333333, | |
| "grad_norm": 30603.955078125, | |
| "kl": 68.61328125, | |
| "learning_rate": 2.3788514916090007e-06, | |
| "loss": 2.6195, | |
| "reward": 0.1875, | |
| "reward_std": 0.2525114044547081, | |
| "rewards/accuracy_reward": 0.1875, | |
| "rewards/format_reward": 0.0, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 619.6458587646484, | |
| "epoch": 0.744, | |
| "grad_norm": 126.99922943115234, | |
| "kl": 2.765625, | |
| "learning_rate": 2.3731844552359343e-06, | |
| "loss": 0.1247, | |
| "reward": 0.06250000186264515, | |
| "reward_std": 0.11558076366782188, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 622.7083435058594, | |
| "epoch": 0.7466666666666667, | |
| "grad_norm": 220.41571044921875, | |
| "kl": 1.99169921875, | |
| "learning_rate": 2.36749850425198e-06, | |
| "loss": 0.0629, | |
| "reward": 0.1875000037252903, | |
| "reward_std": 0.19756478071212769, | |
| "rewards/accuracy_reward": 0.1875000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 778.5625305175781, | |
| "epoch": 0.7493333333333333, | |
| "grad_norm": 179.80470275878906, | |
| "kl": 0.821533203125, | |
| "learning_rate": 2.3617937618241844e-06, | |
| "loss": 0.0322, | |
| "reward": 0.22916667349636555, | |
| "reward_std": 0.35457346215844154, | |
| "rewards/accuracy_reward": 0.22916667349636555, | |
| "rewards/format_reward": 0.0, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 634.8541793823242, | |
| "epoch": 0.752, | |
| "grad_norm": 10.700740814208984, | |
| "kl": 1.4990234375, | |
| "learning_rate": 2.356070351526648e-06, | |
| "loss": 0.0582, | |
| "reward": 0.14583333767950535, | |
| "reward_std": 0.1801304928958416, | |
| "rewards/accuracy_reward": 0.14583333767950535, | |
| "rewards/format_reward": 0.0, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 759.5208740234375, | |
| "epoch": 0.7546666666666667, | |
| "grad_norm": 9.8982572555542, | |
| "kl": 2.18798828125, | |
| "learning_rate": 2.3503283973378465e-06, | |
| "loss": 0.1048, | |
| "reward": 0.12500000558793545, | |
| "reward_std": 0.10206207633018494, | |
| "rewards/accuracy_reward": 0.12500000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 692.6666717529297, | |
| "epoch": 0.7573333333333333, | |
| "grad_norm": 3.14382004737854, | |
| "kl": 0.900390625, | |
| "learning_rate": 2.344568023637949e-06, | |
| "loss": 0.0795, | |
| "reward": 0.16666666977107525, | |
| "reward_std": 0.24859581515192986, | |
| "rewards/accuracy_reward": 0.16666666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 646.4583587646484, | |
| "epoch": 0.76, | |
| "grad_norm": 662.5714111328125, | |
| "kl": 3.06103515625, | |
| "learning_rate": 2.3387893552061204e-06, | |
| "loss": 0.1911, | |
| "reward": 0.1875, | |
| "reward_std": 0.06846532225608826, | |
| "rewards/accuracy_reward": 0.1875, | |
| "rewards/format_reward": 0.0, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 568.3750228881836, | |
| "epoch": 0.7626666666666667, | |
| "grad_norm": 68222.53125, | |
| "kl": 257.8154296875, | |
| "learning_rate": 2.332992517217819e-06, | |
| "loss": 7.0561, | |
| "reward": 0.2916666828095913, | |
| "reward_std": 0.16661180555820465, | |
| "rewards/accuracy_reward": 0.2916666828095913, | |
| "rewards/format_reward": 0.0, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 665.0625305175781, | |
| "epoch": 0.7653333333333333, | |
| "grad_norm": 3179.5244140625, | |
| "kl": 2.095703125, | |
| "learning_rate": 2.327177635242086e-06, | |
| "loss": 0.0406, | |
| "reward": 0.1250000037252903, | |
| "reward_std": 0.22155844047665596, | |
| "rewards/accuracy_reward": 0.1250000037252903, | |
| "rewards/format_reward": 0.0, | |
| "step": 287 | |
| }, | |
| { | |
| "completion_length": 644.2083511352539, | |
| "epoch": 0.768, | |
| "grad_norm": 36765.8671875, | |
| "kl": 173.8992919921875, | |
| "learning_rate": 2.3213448352388254e-06, | |
| "loss": 7.3332, | |
| "reward": 0.47916667722165585, | |
| "reward_std": 0.1530931070446968, | |
| "rewards/accuracy_reward": 0.47916667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 546.6250076293945, | |
| "epoch": 0.7706666666666667, | |
| "grad_norm": 999.5093383789062, | |
| "kl": 4.895263671875, | |
| "learning_rate": 2.315494243556075e-06, | |
| "loss": 0.1335, | |
| "reward": 0.2291666753590107, | |
| "reward_std": 0.2350771315395832, | |
| "rewards/accuracy_reward": 0.2291666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 289 | |
| }, | |
| { | |
| "completion_length": 656.7083435058594, | |
| "epoch": 0.7733333333333333, | |
| "grad_norm": 4107.3349609375, | |
| "kl": 6.30517578125, | |
| "learning_rate": 2.3096259869272697e-06, | |
| "loss": 0.2397, | |
| "reward": 0.2291666753590107, | |
| "reward_std": 0.2621144950389862, | |
| "rewards/accuracy_reward": 0.2291666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 636.6875152587891, | |
| "epoch": 0.776, | |
| "grad_norm": 7.706014633178711, | |
| "kl": 0.93994140625, | |
| "learning_rate": 2.303740192468495e-06, | |
| "loss": 0.0058, | |
| "reward": 0.18750000186264515, | |
| "reward_std": 0.21764283999800682, | |
| "rewards/accuracy_reward": 0.18750000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 291 | |
| }, | |
| { | |
| "completion_length": 515.2083511352539, | |
| "epoch": 0.7786666666666666, | |
| "grad_norm": 4894.8564453125, | |
| "kl": 180.23291015625, | |
| "learning_rate": 2.2978369876757365e-06, | |
| "loss": 8.3769, | |
| "reward": 0.2291666753590107, | |
| "reward_std": 0.21764283999800682, | |
| "rewards/accuracy_reward": 0.2291666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 626.1666870117188, | |
| "epoch": 0.7813333333333333, | |
| "grad_norm": 2025.8204345703125, | |
| "kl": 36.9638671875, | |
| "learning_rate": 2.2919165004221152e-06, | |
| "loss": 1.4316, | |
| "reward": 0.5208333432674408, | |
| "reward_std": 0.1705273911356926, | |
| "rewards/accuracy_reward": 0.5208333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 293 | |
| }, | |
| { | |
| "completion_length": 644.4166870117188, | |
| "epoch": 0.784, | |
| "grad_norm": 2703.410888671875, | |
| "kl": 4.21484375, | |
| "learning_rate": 2.285978858955119e-06, | |
| "loss": 0.1355, | |
| "reward": 0.3541666716337204, | |
| "reward_std": 0.2900237664580345, | |
| "rewards/accuracy_reward": 0.3541666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 605.5000228881836, | |
| "epoch": 0.7866666666666666, | |
| "grad_norm": 52.43381881713867, | |
| "kl": 1.0858154296875, | |
| "learning_rate": 2.280024191893823e-06, | |
| "loss": 0.0851, | |
| "reward": 0.5416666865348816, | |
| "reward_std": 0.20412414148449898, | |
| "rewards/accuracy_reward": 0.5416666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 741.2916870117188, | |
| "epoch": 0.7893333333333333, | |
| "grad_norm": 93.758056640625, | |
| "kl": 0.751220703125, | |
| "learning_rate": 2.274052628226107e-06, | |
| "loss": -0.0145, | |
| "reward": 0.3958333358168602, | |
| "reward_std": 0.39121396839618683, | |
| "rewards/accuracy_reward": 0.3958333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 699.8958587646484, | |
| "epoch": 0.792, | |
| "grad_norm": 62896.68359375, | |
| "kl": 9.72021484375, | |
| "learning_rate": 2.268064297305857e-06, | |
| "loss": 0.5246, | |
| "reward": 0.2083333395421505, | |
| "reward_std": 0.23116152733564377, | |
| "rewards/accuracy_reward": 0.2083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 297 | |
| }, | |
| { | |
| "completion_length": 647.5416870117188, | |
| "epoch": 0.7946666666666666, | |
| "grad_norm": 30.405757904052734, | |
| "kl": 0.86474609375, | |
| "learning_rate": 2.2620593288501667e-06, | |
| "loss": 0.0116, | |
| "reward": 0.4375000074505806, | |
| "reward_std": 0.2525114119052887, | |
| "rewards/accuracy_reward": 0.4375000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 659.2083435058594, | |
| "epoch": 0.7973333333333333, | |
| "grad_norm": 3.99859356880188, | |
| "kl": 0.583648681640625, | |
| "learning_rate": 2.256037852936525e-06, | |
| "loss": -0.0159, | |
| "reward": 0.29166667349636555, | |
| "reward_std": 0.16661179810762405, | |
| "rewards/accuracy_reward": 0.29166667349636555, | |
| "rewards/format_reward": 0.0, | |
| "step": 299 | |
| }, | |
| { | |
| "completion_length": 637.7708587646484, | |
| "epoch": 0.8, | |
| "grad_norm": 27.611656188964844, | |
| "kl": 2.47705078125, | |
| "learning_rate": 2.25e-06, | |
| "loss": 0.1998, | |
| "reward": 0.35416668839752674, | |
| "reward_std": 0.36417656019330025, | |
| "rewards/accuracy_reward": 0.35416668839752674, | |
| "rewards/format_reward": 0.0, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 682.1666870117188, | |
| "epoch": 0.8026666666666666, | |
| "grad_norm": 1630.4654541015625, | |
| "kl": 7.66650390625, | |
| "learning_rate": 2.243945900830413e-06, | |
| "loss": 0.1792, | |
| "reward": 0.27083334513008595, | |
| "reward_std": 0.28219256177544594, | |
| "rewards/accuracy_reward": 0.27083334513008595, | |
| "rewards/format_reward": 0.0, | |
| "step": 301 | |
| }, | |
| { | |
| "completion_length": 413.50001525878906, | |
| "epoch": 0.8053333333333333, | |
| "grad_norm": 88005.484375, | |
| "kl": 838.18359375, | |
| "learning_rate": 2.237875686569506e-06, | |
| "loss": 48.0791, | |
| "reward": 0.0833333358168602, | |
| "reward_std": 0.11949636787176132, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 614.7916870117188, | |
| "epoch": 0.808, | |
| "grad_norm": 111.8994140625, | |
| "kl": 1.419921875, | |
| "learning_rate": 2.231789488708099e-06, | |
| "loss": 0.075, | |
| "reward": 0.1041666716337204, | |
| "reward_std": 0.05103103816509247, | |
| "rewards/accuracy_reward": 0.1041666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 303 | |
| }, | |
| { | |
| "completion_length": 629.0000305175781, | |
| "epoch": 0.8106666666666666, | |
| "grad_norm": 6898.142578125, | |
| "kl": 24.58984375, | |
| "learning_rate": 2.2256874390832447e-06, | |
| "loss": 1.1764, | |
| "reward": 0.16666667722165585, | |
| "reward_std": 0.18404607102274895, | |
| "rewards/accuracy_reward": 0.16666667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 717.2291870117188, | |
| "epoch": 0.8133333333333334, | |
| "grad_norm": 8847.892578125, | |
| "kl": 19.443359375, | |
| "learning_rate": 2.2195696698753695e-06, | |
| "loss": 1.5009, | |
| "reward": 0.0416666679084301, | |
| "reward_std": 0.06454972922801971, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 743.3333435058594, | |
| "epoch": 0.816, | |
| "grad_norm": 458.5970458984375, | |
| "kl": 6.7021484375, | |
| "learning_rate": 2.213436313605413e-06, | |
| "loss": 0.3691, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.0, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 934.1250152587891, | |
| "epoch": 0.8186666666666667, | |
| "grad_norm": 1261.3668212890625, | |
| "kl": 20.8359375, | |
| "learning_rate": 2.2072875031319556e-06, | |
| "loss": 0.5689, | |
| "reward": 0.0416666679084301, | |
| "reward_std": 0.06454972922801971, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 307 | |
| }, | |
| { | |
| "completion_length": 545.4583435058594, | |
| "epoch": 0.8213333333333334, | |
| "grad_norm": 56254.73046875, | |
| "kl": 186.546875, | |
| "learning_rate": 2.2011233716483416e-06, | |
| "loss": 8.0071, | |
| "reward": 0.02083333395421505, | |
| "reward_std": 0.05103103816509247, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 689.2708587646484, | |
| "epoch": 0.824, | |
| "grad_norm": 992.6975708007812, | |
| "kl": 4.853515625, | |
| "learning_rate": 2.1949440526797927e-06, | |
| "loss": 0.2266, | |
| "reward": 0.22916666977107525, | |
| "reward_std": 0.2446802221238613, | |
| "rewards/accuracy_reward": 0.22916666977107525, | |
| "rewards/format_reward": 0.0, | |
| "step": 309 | |
| }, | |
| { | |
| "completion_length": 575.5000152587891, | |
| "epoch": 0.8266666666666667, | |
| "grad_norm": 14.90085506439209, | |
| "kl": 0.2930908203125, | |
| "learning_rate": 2.1887496800805174e-06, | |
| "loss": 0.0365, | |
| "reward": 0.27083333395421505, | |
| "reward_std": 0.28219255805015564, | |
| "rewards/accuracy_reward": 0.27083333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 501.62500762939453, | |
| "epoch": 0.8293333333333334, | |
| "grad_norm": 13.801560401916504, | |
| "kl": 0.2686767578125, | |
| "learning_rate": 2.1825403880308107e-06, | |
| "loss": -0.0458, | |
| "reward": 0.5416666865348816, | |
| "reward_std": 0.16661179438233376, | |
| "rewards/accuracy_reward": 0.5416666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 311 | |
| }, | |
| { | |
| "completion_length": 567.3958587646484, | |
| "epoch": 0.832, | |
| "grad_norm": 25.853025436401367, | |
| "kl": 0.1470947265625, | |
| "learning_rate": 2.1763163110341462e-06, | |
| "loss": 0.0468, | |
| "reward": 0.5625000149011612, | |
| "reward_std": 0.235077116638422, | |
| "rewards/accuracy_reward": 0.5625000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 664.5625076293945, | |
| "epoch": 0.8346666666666667, | |
| "grad_norm": 5866.689453125, | |
| "kl": 25.58026123046875, | |
| "learning_rate": 2.1700775839142652e-06, | |
| "loss": 1.0306, | |
| "reward": 0.7500000149011612, | |
| "reward_std": 0.2861081622540951, | |
| "rewards/accuracy_reward": 0.7500000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 313 | |
| }, | |
| { | |
| "completion_length": 897.1667022705078, | |
| "epoch": 0.8373333333333334, | |
| "grad_norm": 2.1541504859924316, | |
| "kl": 0.10302734375, | |
| "learning_rate": 2.1638243418122534e-06, | |
| "loss": 0.0213, | |
| "reward": 0.6875000149011612, | |
| "reward_std": 0.235077116638422, | |
| "rewards/accuracy_reward": 0.6875000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 633.4166793823242, | |
| "epoch": 0.84, | |
| "grad_norm": 74721.421875, | |
| "kl": 800.21728515625, | |
| "learning_rate": 2.157556720183616e-06, | |
| "loss": 31.9069, | |
| "reward": 0.29166667722165585, | |
| "reward_std": 0.16661179810762405, | |
| "rewards/accuracy_reward": 0.29166667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 810.0208740234375, | |
| "epoch": 0.8426666666666667, | |
| "grad_norm": 56.302738189697266, | |
| "kl": 0.1741943359375, | |
| "learning_rate": 2.151274854795342e-06, | |
| "loss": 0.0364, | |
| "reward": 0.39583334513008595, | |
| "reward_std": 0.28219256177544594, | |
| "rewards/accuracy_reward": 0.39583334513008595, | |
| "rewards/format_reward": 0.0, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 692.9583358764648, | |
| "epoch": 0.8453333333333334, | |
| "grad_norm": 2.8889639377593994, | |
| "kl": 0.11712646484375, | |
| "learning_rate": 2.1449788817229644e-06, | |
| "loss": 0.0226, | |
| "reward": 0.35416667722165585, | |
| "reward_std": 0.2446802221238613, | |
| "rewards/accuracy_reward": 0.35416667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 317 | |
| }, | |
| { | |
| "completion_length": 557.2708587646484, | |
| "epoch": 0.848, | |
| "grad_norm": 59.83365249633789, | |
| "kl": 0.3212890625, | |
| "learning_rate": 2.138668937347609e-06, | |
| "loss": 0.0498, | |
| "reward": 0.5000000223517418, | |
| "reward_std": 0.2957112565636635, | |
| "rewards/accuracy_reward": 0.5000000223517418, | |
| "rewards/format_reward": 0.0, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 585.0417022705078, | |
| "epoch": 0.8506666666666667, | |
| "grad_norm": 57.15522384643555, | |
| "kl": 0.13848876953125, | |
| "learning_rate": 2.132345158353047e-06, | |
| "loss": -0.0406, | |
| "reward": 0.6041666716337204, | |
| "reward_std": 0.18796167895197868, | |
| "rewards/accuracy_reward": 0.6041666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 319 | |
| }, | |
| { | |
| "completion_length": 590.7916870117188, | |
| "epoch": 0.8533333333333334, | |
| "grad_norm": 1.5622169971466064, | |
| "kl": 0.115234375, | |
| "learning_rate": 2.126007681722727e-06, | |
| "loss": 0.011, | |
| "reward": 0.5208333488553762, | |
| "reward_std": 0.1801304928958416, | |
| "rewards/accuracy_reward": 0.5208333488553762, | |
| "rewards/format_reward": 0.0, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 554.6458587646484, | |
| "epoch": 0.856, | |
| "grad_norm": 12.661802291870117, | |
| "kl": 0.06396484375, | |
| "learning_rate": 2.119656644736813e-06, | |
| "loss": 0.0302, | |
| "reward": 0.4791666716337204, | |
| "reward_std": 0.05103103816509247, | |
| "rewards/accuracy_reward": 0.4791666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 321 | |
| }, | |
| { | |
| "completion_length": 620.5625305175781, | |
| "epoch": 0.8586666666666667, | |
| "grad_norm": 0.5451918840408325, | |
| "kl": 0.12109375, | |
| "learning_rate": 2.113292184969207e-06, | |
| "loss": -0.0375, | |
| "reward": 0.5208333507180214, | |
| "reward_std": 0.28219256177544594, | |
| "rewards/accuracy_reward": 0.5208333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 674.4166793823242, | |
| "epoch": 0.8613333333333333, | |
| "grad_norm": 6.236075401306152, | |
| "kl": 0.08465576171875, | |
| "learning_rate": 2.106914440284572e-06, | |
| "loss": 0.0807, | |
| "reward": 0.3125000074505806, | |
| "reward_std": 0.36417658627033234, | |
| "rewards/accuracy_reward": 0.3125000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 323 | |
| }, | |
| { | |
| "completion_length": 529.6875152587891, | |
| "epoch": 0.864, | |
| "grad_norm": 0.7760786414146423, | |
| "kl": 0.107666015625, | |
| "learning_rate": 2.100523548835343e-06, | |
| "loss": 0.0345, | |
| "reward": 0.770833358168602, | |
| "reward_std": 0.21764283627271652, | |
| "rewards/accuracy_reward": 0.770833358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 557.3541793823242, | |
| "epoch": 0.8666666666666667, | |
| "grad_norm": 0.35026639699935913, | |
| "kl": 0.1966552734375, | |
| "learning_rate": 2.0941196490587354e-06, | |
| "loss": 0.0203, | |
| "reward": 0.291666679084301, | |
| "reward_std": 0.23899272456765175, | |
| "rewards/accuracy_reward": 0.291666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 619.5000305175781, | |
| "epoch": 0.8693333333333333, | |
| "grad_norm": 1.0790166854858398, | |
| "kl": 0.156494140625, | |
| "learning_rate": 2.0877028796737477e-06, | |
| "loss": 0.0023, | |
| "reward": 0.4166666679084301, | |
| "reward_std": 0.2957112491130829, | |
| "rewards/accuracy_reward": 0.4166666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 806.5208587646484, | |
| "epoch": 0.872, | |
| "grad_norm": 0.9412611722946167, | |
| "kl": 0.1807861328125, | |
| "learning_rate": 2.0812733796781545e-06, | |
| "loss": 0.0285, | |
| "reward": 0.2500000074505806, | |
| "reward_std": 0.24859582632780075, | |
| "rewards/accuracy_reward": 0.2500000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 327 | |
| }, | |
| { | |
| "completion_length": 633.2083587646484, | |
| "epoch": 0.8746666666666667, | |
| "grad_norm": 2.870098114013672, | |
| "kl": 0.091064453125, | |
| "learning_rate": 2.0748312883454963e-06, | |
| "loss": -0.0113, | |
| "reward": 0.645833358168602, | |
| "reward_std": 0.2446802221238613, | |
| "rewards/accuracy_reward": 0.645833358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 572.8958587646484, | |
| "epoch": 0.8773333333333333, | |
| "grad_norm": 0.3554651141166687, | |
| "kl": 0.192626953125, | |
| "learning_rate": 2.068376745222062e-06, | |
| "loss": 0.0353, | |
| "reward": 0.4791666865348816, | |
| "reward_std": 0.28219255432486534, | |
| "rewards/accuracy_reward": 0.4791666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 329 | |
| }, | |
| { | |
| "completion_length": 773.5416870117188, | |
| "epoch": 0.88, | |
| "grad_norm": 0.2810516059398651, | |
| "kl": 0.16339111328125, | |
| "learning_rate": 2.061909890123868e-06, | |
| "loss": 0.018, | |
| "reward": 0.3333333432674408, | |
| "reward_std": 0.11949635669589043, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 846.6667022705078, | |
| "epoch": 0.8826666666666667, | |
| "grad_norm": 0.6509025692939758, | |
| "kl": 0.129791259765625, | |
| "learning_rate": 2.055430863133628e-06, | |
| "loss": 0.0358, | |
| "reward": 0.4166666865348816, | |
| "reward_std": 0.2957112491130829, | |
| "rewards/accuracy_reward": 0.4166666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 331 | |
| }, | |
| { | |
| "completion_length": 717.7500152587891, | |
| "epoch": 0.8853333333333333, | |
| "grad_norm": 0.8496950268745422, | |
| "kl": 0.230712890625, | |
| "learning_rate": 2.048939804597718e-06, | |
| "loss": 0.0931, | |
| "reward": 0.416666679084301, | |
| "reward_std": 0.4152075946331024, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 807.5000305175781, | |
| "epoch": 0.888, | |
| "grad_norm": 0.49586477875709534, | |
| "kl": 0.10491943359375, | |
| "learning_rate": 2.0424368551231384e-06, | |
| "loss": 0.0089, | |
| "reward": 0.5208333432674408, | |
| "reward_std": 0.3170611187815666, | |
| "rewards/accuracy_reward": 0.5208333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 333 | |
| }, | |
| { | |
| "completion_length": 501.5208511352539, | |
| "epoch": 0.8906666666666667, | |
| "grad_norm": 0.40998944640159607, | |
| "kl": 0.1203460693359375, | |
| "learning_rate": 2.035922155574466e-06, | |
| "loss": -0.0339, | |
| "reward": 0.7083333432674408, | |
| "reward_std": 0.18404608592391014, | |
| "rewards/accuracy_reward": 0.7083333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 844.4166870117188, | |
| "epoch": 0.8933333333333333, | |
| "grad_norm": 12.733755111694336, | |
| "kl": 0.199462890625, | |
| "learning_rate": 2.0293958470708033e-06, | |
| "loss": -0.001, | |
| "reward": 0.6250000149011612, | |
| "reward_std": 0.3506578914821148, | |
| "rewards/accuracy_reward": 0.6250000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 662.5833435058594, | |
| "epoch": 0.896, | |
| "grad_norm": 2.5601916313171387, | |
| "kl": 0.1180419921875, | |
| "learning_rate": 2.022858070982723e-06, | |
| "loss": 0.0709, | |
| "reward": 0.7083333432674408, | |
| "reward_std": 0.2861081510782242, | |
| "rewards/accuracy_reward": 0.7083333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 858.6041870117188, | |
| "epoch": 0.8986666666666666, | |
| "grad_norm": 0.2826376259326935, | |
| "kl": 0.1029052734375, | |
| "learning_rate": 2.016308968929203e-06, | |
| "loss": 0.027, | |
| "reward": 0.5625000055879354, | |
| "reward_std": 0.21764282882213593, | |
| "rewards/accuracy_reward": 0.5625000055879354, | |
| "rewards/format_reward": 0.0, | |
| "step": 337 | |
| }, | |
| { | |
| "completion_length": 878.3125152587891, | |
| "epoch": 0.9013333333333333, | |
| "grad_norm": 0.3674981892108917, | |
| "kl": 0.203125, | |
| "learning_rate": 2.0097486827745623e-06, | |
| "loss": -0.0174, | |
| "reward": 0.4791666865348816, | |
| "reward_std": 0.3720077611505985, | |
| "rewards/accuracy_reward": 0.4791666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 714.2083587646484, | |
| "epoch": 0.904, | |
| "grad_norm": 0.7770251631736755, | |
| "kl": 0.24627685546875, | |
| "learning_rate": 2.0031773546253826e-06, | |
| "loss": 0.0725, | |
| "reward": 0.6041666716337204, | |
| "reward_std": 0.42872630804777145, | |
| "rewards/accuracy_reward": 0.6041666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 339 | |
| }, | |
| { | |
| "completion_length": 593.9791870117188, | |
| "epoch": 0.9066666666666666, | |
| "grad_norm": 0.45360517501831055, | |
| "kl": 0.2352294921875, | |
| "learning_rate": 1.9965951268274372e-06, | |
| "loss": 0.0406, | |
| "reward": 0.5833333507180214, | |
| "reward_std": 0.31314554437994957, | |
| "rewards/accuracy_reward": 0.5833333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 664.5625152587891, | |
| "epoch": 0.9093333333333333, | |
| "grad_norm": 0.1532231569290161, | |
| "kl": 0.172698974609375, | |
| "learning_rate": 1.9900021419626017e-06, | |
| "loss": 0.0248, | |
| "reward": 0.2291666753590107, | |
| "reward_std": 0.1801304928958416, | |
| "rewards/accuracy_reward": 0.2291666753590107, | |
| "rewards/format_reward": 0.0, | |
| "step": 341 | |
| }, | |
| { | |
| "completion_length": 617.7083587646484, | |
| "epoch": 0.912, | |
| "grad_norm": 0.21975946426391602, | |
| "kl": 0.29150390625, | |
| "learning_rate": 1.983398542845767e-06, | |
| "loss": -0.0212, | |
| "reward": 0.645833358168602, | |
| "reward_std": 0.38161083683371544, | |
| "rewards/accuracy_reward": 0.645833358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 529.5000228881836, | |
| "epoch": 0.9146666666666666, | |
| "grad_norm": 0.44750016927719116, | |
| "kl": 0.2016754150390625, | |
| "learning_rate": 1.976784472521747e-06, | |
| "loss": 0.0377, | |
| "reward": 0.7083333432674408, | |
| "reward_std": 0.3332235999405384, | |
| "rewards/accuracy_reward": 0.7083333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 343 | |
| }, | |
| { | |
| "completion_length": 683.6458435058594, | |
| "epoch": 0.9173333333333333, | |
| "grad_norm": 0.10972858965396881, | |
| "kl": 0.06365966796875, | |
| "learning_rate": 1.9701600742621796e-06, | |
| "loss": 0.0364, | |
| "reward": 0.6041666865348816, | |
| "reward_std": 0.1801304928958416, | |
| "rewards/accuracy_reward": 0.6041666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 833.9791870117188, | |
| "epoch": 0.92, | |
| "grad_norm": 0.28129515051841736, | |
| "kl": 0.251007080078125, | |
| "learning_rate": 1.963525491562421e-06, | |
| "loss": 0.0688, | |
| "reward": 0.4791666716337204, | |
| "reward_std": 0.25515518710017204, | |
| "rewards/accuracy_reward": 0.4791666716337204, | |
| "rewards/format_reward": 0.0, | |
| "step": 345 | |
| }, | |
| { | |
| "completion_length": 591.0000228881836, | |
| "epoch": 0.9226666666666666, | |
| "grad_norm": 0.10267713665962219, | |
| "kl": 0.1419677734375, | |
| "learning_rate": 1.9568808681384415e-06, | |
| "loss": -0.028, | |
| "reward": 0.33333333395421505, | |
| "reward_std": 0.22155842557549477, | |
| "rewards/accuracy_reward": 0.33333333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 557.1250076293945, | |
| "epoch": 0.9253333333333333, | |
| "grad_norm": 0.27456042170524597, | |
| "kl": 0.296875, | |
| "learning_rate": 1.9502263479237084e-06, | |
| "loss": -0.0547, | |
| "reward": 0.6041666865348816, | |
| "reward_std": 0.2996268570423126, | |
| "rewards/accuracy_reward": 0.6041666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 347 | |
| }, | |
| { | |
| "completion_length": 713.6875152587891, | |
| "epoch": 0.928, | |
| "grad_norm": 0.1611674427986145, | |
| "kl": 0.17510986328125, | |
| "learning_rate": 1.9435620750660703e-06, | |
| "loss": 0.022, | |
| "reward": 0.3750000074505806, | |
| "reward_std": 0.12909945845603943, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 541.6041793823242, | |
| "epoch": 0.9306666666666666, | |
| "grad_norm": 0.9292861819267273, | |
| "kl": 0.426513671875, | |
| "learning_rate": 1.9368881939246333e-06, | |
| "loss": 0.0624, | |
| "reward": 0.6666666865348816, | |
| "reward_std": 0.3602609783411026, | |
| "rewards/accuracy_reward": 0.6666666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 349 | |
| }, | |
| { | |
| "completion_length": 698.0625152587891, | |
| "epoch": 0.9333333333333333, | |
| "grad_norm": 0.18113847076892853, | |
| "kl": 0.34326171875, | |
| "learning_rate": 1.9302048490666355e-06, | |
| "loss": 0.0351, | |
| "reward": 0.35416667722165585, | |
| "reward_std": 0.21764283254742622, | |
| "rewards/accuracy_reward": 0.35416667722165585, | |
| "rewards/format_reward": 0.0, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 647.7708435058594, | |
| "epoch": 0.936, | |
| "grad_norm": 0.23358656466007233, | |
| "kl": 0.237548828125, | |
| "learning_rate": 1.923512185264315e-06, | |
| "loss": 0.0175, | |
| "reward": 0.3958333395421505, | |
| "reward_std": 0.41129202395677567, | |
| "rewards/accuracy_reward": 0.3958333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 351 | |
| }, | |
| { | |
| "completion_length": 674.1041870117188, | |
| "epoch": 0.9386666666666666, | |
| "grad_norm": 0.2018204629421234, | |
| "kl": 0.1859283447265625, | |
| "learning_rate": 1.916810347491772e-06, | |
| "loss": 0.0137, | |
| "reward": 0.6250000149011612, | |
| "reward_std": 0.23116153478622437, | |
| "rewards/accuracy_reward": 0.6250000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 844.1875305175781, | |
| "epoch": 0.9413333333333334, | |
| "grad_norm": 0.3149741590023041, | |
| "kl": 0.25860595703125, | |
| "learning_rate": 1.9100994809218323e-06, | |
| "loss": 0.0756, | |
| "reward": 0.37500000186264515, | |
| "reward_std": 0.3332235924899578, | |
| "rewards/accuracy_reward": 0.37500000186264515, | |
| "rewards/format_reward": 0.0, | |
| "step": 353 | |
| }, | |
| { | |
| "completion_length": 625.4583587646484, | |
| "epoch": 0.944, | |
| "grad_norm": 0.41483765840530396, | |
| "kl": 0.3572998046875, | |
| "learning_rate": 1.9033797309228985e-06, | |
| "loss": 0.1117, | |
| "reward": 0.5625000298023224, | |
| "reward_std": 0.2996268458664417, | |
| "rewards/accuracy_reward": 0.5625000298023224, | |
| "rewards/format_reward": 0.0, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 701.3541717529297, | |
| "epoch": 0.9466666666666667, | |
| "grad_norm": 1.9215991497039795, | |
| "kl": 0.716796875, | |
| "learning_rate": 1.8966512430558036e-06, | |
| "loss": 0.1199, | |
| "reward": 0.5625000149011612, | |
| "reward_std": 0.2350771352648735, | |
| "rewards/accuracy_reward": 0.5625000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 355 | |
| }, | |
| { | |
| "completion_length": 874.7916946411133, | |
| "epoch": 0.9493333333333334, | |
| "grad_norm": 2.8515124320983887, | |
| "kl": 1.845062255859375, | |
| "learning_rate": 1.8899141630706564e-06, | |
| "loss": 0.1755, | |
| "reward": 0.6875000223517418, | |
| "reward_std": 0.39208584278821945, | |
| "rewards/accuracy_reward": 0.6875000223517418, | |
| "rewards/format_reward": 0.0, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 766.0833435058594, | |
| "epoch": 0.952, | |
| "grad_norm": 2.6256027221679688, | |
| "kl": 2.3642578125, | |
| "learning_rate": 1.8831686369036859e-06, | |
| "loss": 0.1805, | |
| "reward": 0.39583334140479565, | |
| "reward_std": 0.2350771240890026, | |
| "rewards/accuracy_reward": 0.39583334140479565, | |
| "rewards/format_reward": 0.0, | |
| "step": 357 | |
| }, | |
| { | |
| "completion_length": 714.0208511352539, | |
| "epoch": 0.9546666666666667, | |
| "grad_norm": 1.6547703742980957, | |
| "kl": 1.00885009765625, | |
| "learning_rate": 1.876414810674079e-06, | |
| "loss": 0.0457, | |
| "reward": 0.5208333432674408, | |
| "reward_std": 0.235077116638422, | |
| "rewards/accuracy_reward": 0.5208333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 609.4583587646484, | |
| "epoch": 0.9573333333333334, | |
| "grad_norm": 0.5665653944015503, | |
| "kl": 0.771728515625, | |
| "learning_rate": 1.8696528306808168e-06, | |
| "loss": 0.0998, | |
| "reward": 0.4583333432674408, | |
| "reward_std": 0.18404607102274895, | |
| "rewards/accuracy_reward": 0.4583333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 359 | |
| }, | |
| { | |
| "completion_length": 730.7291870117188, | |
| "epoch": 0.96, | |
| "grad_norm": 1.261367917060852, | |
| "kl": 0.23529052734375, | |
| "learning_rate": 1.8628828433995015e-06, | |
| "loss": 0.03, | |
| "reward": 0.6250000149011612, | |
| "reward_std": 0.22155843302607536, | |
| "rewards/accuracy_reward": 0.6250000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 808.5625305175781, | |
| "epoch": 0.9626666666666667, | |
| "grad_norm": 1.5878499746322632, | |
| "kl": 0.54681396484375, | |
| "learning_rate": 1.8561049954791895e-06, | |
| "loss": 0.0336, | |
| "reward": 0.5208333358168602, | |
| "reward_std": 0.13301505148410797, | |
| "rewards/accuracy_reward": 0.5208333358168602, | |
| "rewards/format_reward": 0.0, | |
| "step": 361 | |
| }, | |
| { | |
| "completion_length": 815.1041870117188, | |
| "epoch": 0.9653333333333334, | |
| "grad_norm": 0.3808901309967041, | |
| "kl": 0.371826171875, | |
| "learning_rate": 1.8493194337392087e-06, | |
| "loss": 0.0859, | |
| "reward": 0.541666679084301, | |
| "reward_std": 0.16661179810762405, | |
| "rewards/accuracy_reward": 0.541666679084301, | |
| "rewards/format_reward": 0.0, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 522.4791870117188, | |
| "epoch": 0.968, | |
| "grad_norm": 0.30578020215034485, | |
| "kl": 0.1224365234375, | |
| "learning_rate": 1.8425263051659837e-06, | |
| "loss": 0.0237, | |
| "reward": 0.5000000074505806, | |
| "reward_std": 0.18404608592391014, | |
| "rewards/accuracy_reward": 0.5000000074505806, | |
| "rewards/format_reward": 0.0, | |
| "step": 363 | |
| }, | |
| { | |
| "completion_length": 668.5000076293945, | |
| "epoch": 0.9706666666666667, | |
| "grad_norm": 0.26360616087913513, | |
| "kl": 0.136077880859375, | |
| "learning_rate": 1.8357257569098473e-06, | |
| "loss": 0.0111, | |
| "reward": 0.5208333507180214, | |
| "reward_std": 0.44616059213876724, | |
| "rewards/accuracy_reward": 0.5208333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 728.0000228881836, | |
| "epoch": 0.9733333333333334, | |
| "grad_norm": 1.045130729675293, | |
| "kl": 0.154876708984375, | |
| "learning_rate": 1.828917936281855e-06, | |
| "loss": 0.0659, | |
| "reward": 0.3333333432674408, | |
| "reward_std": 0.30354245379567146, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "step": 365 | |
| }, | |
| { | |
| "completion_length": 753.1458587646484, | |
| "epoch": 0.976, | |
| "grad_norm": 0.5927110910415649, | |
| "kl": 0.25830078125, | |
| "learning_rate": 1.822102990750595e-06, | |
| "loss": 0.0534, | |
| "reward": 0.3958333507180214, | |
| "reward_std": 0.36417658627033234, | |
| "rewards/accuracy_reward": 0.3958333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 731.2291870117188, | |
| "epoch": 0.9786666666666667, | |
| "grad_norm": 0.9049092531204224, | |
| "kl": 0.2169189453125, | |
| "learning_rate": 1.8152810679389911e-06, | |
| "loss": 0.1457, | |
| "reward": 0.5416666828095913, | |
| "reward_std": 0.3602609820663929, | |
| "rewards/accuracy_reward": 0.5416666828095913, | |
| "rewards/format_reward": 0.0, | |
| "step": 367 | |
| }, | |
| { | |
| "completion_length": 763.1667022705078, | |
| "epoch": 0.9813333333333333, | |
| "grad_norm": 0.8557331562042236, | |
| "kl": 0.7392578125, | |
| "learning_rate": 1.808452315621108e-06, | |
| "loss": 0.0306, | |
| "reward": 0.45833334140479565, | |
| "reward_std": 0.35848909616470337, | |
| "rewards/accuracy_reward": 0.45833334140479565, | |
| "rewards/format_reward": 0.0, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 432.0416793823242, | |
| "epoch": 0.984, | |
| "grad_norm": 0.7425907254219055, | |
| "kl": 0.904541015625, | |
| "learning_rate": 1.8016168817189471e-06, | |
| "loss": 0.0233, | |
| "reward": 0.7916666865348816, | |
| "reward_std": 0.3332235962152481, | |
| "rewards/accuracy_reward": 0.7916666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 369 | |
| }, | |
| { | |
| "completion_length": 543.8125152587891, | |
| "epoch": 0.9866666666666667, | |
| "grad_norm": 1.483704686164856, | |
| "kl": 1.626220703125, | |
| "learning_rate": 1.7947749142992453e-06, | |
| "loss": 0.1434, | |
| "reward": 0.6458333507180214, | |
| "reward_std": 0.35457349941134453, | |
| "rewards/accuracy_reward": 0.6458333507180214, | |
| "rewards/format_reward": 0.0, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 750.6458435058594, | |
| "epoch": 0.9893333333333333, | |
| "grad_norm": 1.1710835695266724, | |
| "kl": 1.1573486328125, | |
| "learning_rate": 1.7879265615702653e-06, | |
| "loss": 0.1069, | |
| "reward": 0.4375000149011612, | |
| "reward_std": 0.38161085173487663, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 371 | |
| }, | |
| { | |
| "completion_length": 821.9583435058594, | |
| "epoch": 0.992, | |
| "grad_norm": 2.896385431289673, | |
| "kl": 3.43359375, | |
| "learning_rate": 1.7810719718785873e-06, | |
| "loss": 0.193, | |
| "reward": 0.31250000558793545, | |
| "reward_std": 0.3074580393731594, | |
| "rewards/accuracy_reward": 0.31250000558793545, | |
| "rewards/format_reward": 0.0, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 663.1875076293945, | |
| "epoch": 0.9946666666666667, | |
| "grad_norm": 2.6439497470855713, | |
| "kl": 3.552734375, | |
| "learning_rate": 1.7742112937058924e-06, | |
| "loss": 0.1882, | |
| "reward": 0.4583333395421505, | |
| "reward_std": 0.4248107075691223, | |
| "rewards/accuracy_reward": 0.4583333395421505, | |
| "rewards/format_reward": 0.0, | |
| "step": 373 | |
| }, | |
| { | |
| "completion_length": 633.2916793823242, | |
| "epoch": 0.9973333333333333, | |
| "grad_norm": 0.7656214237213135, | |
| "kl": 0.66461181640625, | |
| "learning_rate": 1.76734467566575e-06, | |
| "loss": 0.0844, | |
| "reward": 0.7291666865348816, | |
| "reward_std": 0.21764283999800682, | |
| "rewards/accuracy_reward": 0.7291666865348816, | |
| "rewards/format_reward": 0.0, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 620.1041793823242, | |
| "epoch": 1.0, | |
| "grad_norm": 1.1568132638931274, | |
| "kl": 1.1962890625, | |
| "learning_rate": 1.7604722665003958e-06, | |
| "loss": 0.1247, | |
| "reward": 0.7500000149011612, | |
| "reward_std": 0.4152076169848442, | |
| "rewards/accuracy_reward": 0.7500000149011612, | |
| "rewards/format_reward": 0.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_completion_length": 723.7617282104492, | |
| "eval_kl": 1.5607896118164062, | |
| "eval_loss": 0.08730700612068176, | |
| "eval_reward": 0.4726666794717312, | |
| "eval_reward_std": 0.29052443864941596, | |
| "eval_rewards/accuracy_reward": 0.4726666794717312, | |
| "eval_rewards/format_reward": 0.0, | |
| "eval_runtime": 30137.8072, | |
| "eval_samples_per_second": 0.066, | |
| "eval_steps_per_second": 0.006, | |
| "step": 375 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 750, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 3, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |