| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.5, | |
| "eval_steps": 500, | |
| "global_step": 287, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 243.21429443359375, | |
| "epoch": 0.0017421602787456446, | |
| "grad_norm": 11.870542526245117, | |
| "kl": 0.0, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "reward": 1.1250000596046448, | |
| "reward_std": 0.35204461216926575, | |
| "rewards/accuracy_reward": 0.2321428693830967, | |
| "rewards/format_reward": 0.892857164144516, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 232.3928680419922, | |
| "epoch": 0.003484320557491289, | |
| "grad_norm": 21.013626098632812, | |
| "kl": 0.002597808837890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.0535714626312256, | |
| "reward_std": 0.4149572253227234, | |
| "rewards/accuracy_reward": 0.196428582072258, | |
| "rewards/format_reward": 0.8571429252624512, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 314.5893020629883, | |
| "epoch": 0.005226480836236934, | |
| "grad_norm": 8.975765228271484, | |
| "kl": 0.01239013671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0005, | |
| "reward": 1.0178571939468384, | |
| "reward_std": 0.2851574905216694, | |
| "rewards/accuracy_reward": 0.1071428656578064, | |
| "rewards/format_reward": 0.910714328289032, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 284.1964416503906, | |
| "epoch": 0.006968641114982578, | |
| "grad_norm": 4.65426778793335, | |
| "kl": 0.008523941040039062, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.160714328289032, | |
| "reward_std": 0.2610500454902649, | |
| "rewards/accuracy_reward": 0.196428582072258, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 298.21429443359375, | |
| "epoch": 0.008710801393728223, | |
| "grad_norm": 1.674553632736206, | |
| "kl": 0.00372314453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.1250000596046448, | |
| "reward_std": 0.3193712383508682, | |
| "rewards/accuracy_reward": 0.1785714402794838, | |
| "rewards/format_reward": 0.9464285969734192, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 286.9464340209961, | |
| "epoch": 0.010452961672473868, | |
| "grad_norm": 1.6619535684585571, | |
| "kl": 0.000659942626953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 1.2678571939468384, | |
| "reward_std": 0.3606105446815491, | |
| "rewards/accuracy_reward": 0.2857142984867096, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 323.1964416503906, | |
| "epoch": 0.012195121951219513, | |
| "grad_norm": 1.0895068645477295, | |
| "kl": 0.005126953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.1071429252624512, | |
| "reward_std": 0.1428571529686451, | |
| "rewards/accuracy_reward": 0.1428571492433548, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 288.1428756713867, | |
| "epoch": 0.013937282229965157, | |
| "grad_norm": 1.8034625053405762, | |
| "kl": 0.0026731491088867188, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.2142857909202576, | |
| "reward_std": 0.2967643216252327, | |
| "rewards/accuracy_reward": 0.232142873108387, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 263.23216247558594, | |
| "epoch": 0.0156794425087108, | |
| "grad_norm": 0.7990542054176331, | |
| "kl": 0.005615234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.1071429252624512, | |
| "reward_std": 0.18409645557403564, | |
| "rewards/accuracy_reward": 0.1428571492433548, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 201.53572845458984, | |
| "epoch": 0.017421602787456445, | |
| "grad_norm": 1.5415644645690918, | |
| "kl": 0.0062713623046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.3571429252624512, | |
| "reward_std": 0.2363857999444008, | |
| "rewards/accuracy_reward": 0.3571428656578064, | |
| "rewards/format_reward": 1.0, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 73.30357551574707, | |
| "epoch": 0.01916376306620209, | |
| "grad_norm": 1.2912265062332153, | |
| "kl": 0.011810302734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0005, | |
| "reward": 1.2500000596046448, | |
| "reward_std": 0.1428571529686451, | |
| "rewards/accuracy_reward": 0.25000000931322575, | |
| "rewards/format_reward": 1.0, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 131.8214340209961, | |
| "epoch": 0.020905923344947737, | |
| "grad_norm": 0.8542214035987854, | |
| "kl": 0.009796142578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.1250000596046448, | |
| "reward_std": 0.13981622830033302, | |
| "rewards/accuracy_reward": 0.160714291036129, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 133.25000762939453, | |
| "epoch": 0.02264808362369338, | |
| "grad_norm": 1.5221061706542969, | |
| "kl": 0.014312744140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.2142857313156128, | |
| "reward_std": 0.2253357619047165, | |
| "rewards/accuracy_reward": 0.2142857313156128, | |
| "rewards/format_reward": 1.0, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 83.32143020629883, | |
| "epoch": 0.024390243902439025, | |
| "grad_norm": 0.7153174877166748, | |
| "kl": 0.0189208984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.1964285969734192, | |
| "reward_std": 0.07695358991622925, | |
| "rewards/accuracy_reward": 0.1964285746216774, | |
| "rewards/format_reward": 1.0, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 96.44642925262451, | |
| "epoch": 0.02613240418118467, | |
| "grad_norm": 1.98847496509552, | |
| "kl": 0.01861572265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.071428656578064, | |
| "reward_std": 0.1428571492433548, | |
| "rewards/accuracy_reward": 0.0892857201397419, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 100.4285774230957, | |
| "epoch": 0.027874564459930314, | |
| "grad_norm": 1.2997009754180908, | |
| "kl": 0.0174560546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.2321429252624512, | |
| "reward_std": 0.14838216826319695, | |
| "rewards/accuracy_reward": 0.23214287497103214, | |
| "rewards/format_reward": 1.0, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 126.35714721679688, | |
| "epoch": 0.029616724738675958, | |
| "grad_norm": 1.5723376274108887, | |
| "kl": 0.01885986328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.3214285969734192, | |
| "reward_std": 0.24695908278226852, | |
| "rewards/accuracy_reward": 0.3392857313156128, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 128.23215103149414, | |
| "epoch": 0.0313588850174216, | |
| "grad_norm": 1.6021169424057007, | |
| "kl": 0.016815185546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.2142857909202576, | |
| "reward_std": 0.18409645557403564, | |
| "rewards/accuracy_reward": 0.2321428656578064, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 103.39286041259766, | |
| "epoch": 0.033101045296167246, | |
| "grad_norm": 1.91715407371521, | |
| "kl": 0.028564453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.1250000596046448, | |
| "reward_std": 0.21981074661016464, | |
| "rewards/accuracy_reward": 0.1428571492433548, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 91.7500057220459, | |
| "epoch": 0.03484320557491289, | |
| "grad_norm": 2.3495121002197266, | |
| "kl": 0.0179443359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.1071428656578064, | |
| "reward_std": 0.11266788095235825, | |
| "rewards/accuracy_reward": 0.12500000931322575, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 85.98214721679688, | |
| "epoch": 0.036585365853658534, | |
| "grad_norm": 2.7929649353027344, | |
| "kl": 0.0169677734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.2142857313156128, | |
| "reward_std": 0.33800363540649414, | |
| "rewards/accuracy_reward": 0.2321428656578064, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 108.64286422729492, | |
| "epoch": 0.03832752613240418, | |
| "grad_norm": 1.8589195013046265, | |
| "kl": 0.016265869140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.1964285969734192, | |
| "reward_std": 0.2937234044075012, | |
| "rewards/accuracy_reward": 0.2142857164144516, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 110.83928680419922, | |
| "epoch": 0.04006968641114982, | |
| "grad_norm": 2.335958957672119, | |
| "kl": 0.01531982421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.1964285969734192, | |
| "reward_std": 0.23086076974868774, | |
| "rewards/accuracy_reward": 0.2142857164144516, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 137.08929061889648, | |
| "epoch": 0.041811846689895474, | |
| "grad_norm": 1.5521477460861206, | |
| "kl": 0.02178955078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.0892857909202576, | |
| "reward_std": 0.17651408910751343, | |
| "rewards/accuracy_reward": 0.1071428619325161, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 79.53571891784668, | |
| "epoch": 0.04355400696864112, | |
| "grad_norm": 1.3790969848632812, | |
| "kl": 0.0279541015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.2142857909202576, | |
| "reward_std": 0.19514649361371994, | |
| "rewards/accuracy_reward": 0.2142857238650322, | |
| "rewards/format_reward": 1.0, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 218.87500762939453, | |
| "epoch": 0.04529616724738676, | |
| "grad_norm": 0.9893456101417542, | |
| "kl": 0.014007568359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.160714328289032, | |
| "reward_std": 0.1785714402794838, | |
| "rewards/accuracy_reward": 0.196428582072258, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 65.67857551574707, | |
| "epoch": 0.047038327526132406, | |
| "grad_norm": 1.9509761333465576, | |
| "kl": 0.02667236328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.1428571939468384, | |
| "reward_std": 0.18409645557403564, | |
| "rewards/accuracy_reward": 0.1428571492433548, | |
| "rewards/format_reward": 1.0, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 103.58928680419922, | |
| "epoch": 0.04878048780487805, | |
| "grad_norm": 1.6537208557128906, | |
| "kl": 0.03692626953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.2500000596046448, | |
| "reward_std": 0.2253357544541359, | |
| "rewards/accuracy_reward": 0.2857142984867096, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 48.07143211364746, | |
| "epoch": 0.050522648083623695, | |
| "grad_norm": 2.2078816890716553, | |
| "kl": 0.03143310546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.321428656578064, | |
| "reward_std": 0.2253357619047165, | |
| "rewards/accuracy_reward": 0.3392857313156128, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 55.160715103149414, | |
| "epoch": 0.05226480836236934, | |
| "grad_norm": 2.4559199810028076, | |
| "kl": 0.03167724609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.2321429252624512, | |
| "reward_std": 0.2610500305891037, | |
| "rewards/accuracy_reward": 0.2321428656578064, | |
| "rewards/format_reward": 1.0, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 131.6964340209961, | |
| "epoch": 0.05400696864111498, | |
| "grad_norm": 1.1578391790390015, | |
| "kl": 0.02935791015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.160714328289032, | |
| "reward_std": 0.20670335739850998, | |
| "rewards/accuracy_reward": 0.1964285857975483, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 46.571431159973145, | |
| "epoch": 0.05574912891986063, | |
| "grad_norm": 3.844055652618408, | |
| "kl": 0.0350341796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.3214285969734192, | |
| "reward_std": 0.19514648616313934, | |
| "rewards/accuracy_reward": 0.321428582072258, | |
| "rewards/format_reward": 1.0, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 66.50000190734863, | |
| "epoch": 0.05749128919860627, | |
| "grad_norm": 1.843333125114441, | |
| "kl": 0.02716064453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.1964285969734192, | |
| "reward_std": 0.14838216826319695, | |
| "rewards/accuracy_reward": 0.196428582072258, | |
| "rewards/format_reward": 1.0, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 123.50000762939453, | |
| "epoch": 0.059233449477351915, | |
| "grad_norm": 1.1580967903137207, | |
| "kl": 0.027099609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.160714328289032, | |
| "reward_std": 0.1071428619325161, | |
| "rewards/accuracy_reward": 0.17857144214212894, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 145.44643783569336, | |
| "epoch": 0.06097560975609756, | |
| "grad_norm": 0.6297839879989624, | |
| "kl": 0.02728271484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.1250000596046448, | |
| "reward_std": 0.1071428619325161, | |
| "rewards/accuracy_reward": 0.14285714365541935, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 175.9285774230957, | |
| "epoch": 0.0627177700348432, | |
| "grad_norm": 1.174981951713562, | |
| "kl": 0.023681640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.2500000596046448, | |
| "reward_std": 0.2580091208219528, | |
| "rewards/accuracy_reward": 0.3035714328289032, | |
| "rewards/format_reward": 0.9464285969734192, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 122.89286041259766, | |
| "epoch": 0.06445993031358885, | |
| "grad_norm": 0.48167216777801514, | |
| "kl": 0.03033447265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.3750000596046448, | |
| "reward_std": 0.13981622830033302, | |
| "rewards/accuracy_reward": 0.4107142984867096, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 54.08928871154785, | |
| "epoch": 0.06620209059233449, | |
| "grad_norm": 2.2061703205108643, | |
| "kl": 0.02490234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.2142857313156128, | |
| "reward_std": 0.11266787722706795, | |
| "rewards/accuracy_reward": 0.2142857164144516, | |
| "rewards/format_reward": 1.0, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 90.75000381469727, | |
| "epoch": 0.06794425087108014, | |
| "grad_norm": 1.4908874034881592, | |
| "kl": 0.02630615234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.160714328289032, | |
| "reward_std": 0.1181928962469101, | |
| "rewards/accuracy_reward": 0.1607142984867096, | |
| "rewards/format_reward": 1.0, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 76.19643211364746, | |
| "epoch": 0.06968641114982578, | |
| "grad_norm": 1.1106164455413818, | |
| "kl": 0.03558349609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.1428571939468384, | |
| "reward_std": 0.1428571529686451, | |
| "rewards/accuracy_reward": 0.160714291036129, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 93.75000381469727, | |
| "epoch": 0.07142857142857142, | |
| "grad_norm": 2.4737675189971924, | |
| "kl": 0.02557373046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.2142857909202576, | |
| "reward_std": 0.21676981449127197, | |
| "rewards/accuracy_reward": 0.232142873108387, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 87.07143592834473, | |
| "epoch": 0.07317073170731707, | |
| "grad_norm": 2.791724443435669, | |
| "kl": 0.0623779296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.410714328289032, | |
| "reward_std": 0.1785714402794838, | |
| "rewards/accuracy_reward": 0.4107142984867096, | |
| "rewards/format_reward": 1.0, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 71.01786041259766, | |
| "epoch": 0.07491289198606271, | |
| "grad_norm": 3.068845748901367, | |
| "kl": 0.02935791015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.1428571939468384, | |
| "reward_std": 0.1539071835577488, | |
| "rewards/accuracy_reward": 0.16071429289877415, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 102.42857360839844, | |
| "epoch": 0.07665505226480836, | |
| "grad_norm": 3.0162665843963623, | |
| "kl": 0.026123046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.2500000596046448, | |
| "reward_std": 0.2142857313156128, | |
| "rewards/accuracy_reward": 0.2500000149011612, | |
| "rewards/format_reward": 1.0, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 140.58929443359375, | |
| "epoch": 0.078397212543554, | |
| "grad_norm": 3.6954751014709473, | |
| "kl": 0.032470703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.2321429252624512, | |
| "reward_std": 0.1785714402794838, | |
| "rewards/accuracy_reward": 0.2857142984867096, | |
| "rewards/format_reward": 0.9464285969734192, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 105.83929061889648, | |
| "epoch": 0.08013937282229965, | |
| "grad_norm": 1.9819762706756592, | |
| "kl": 0.04046630859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.3928571939468384, | |
| "reward_std": 0.19514650478959084, | |
| "rewards/accuracy_reward": 0.392857164144516, | |
| "rewards/format_reward": 1.0, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 157.3928680419922, | |
| "epoch": 0.08188153310104529, | |
| "grad_norm": 1.7190651893615723, | |
| "kl": 0.02734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.2142857909202576, | |
| "reward_std": 0.2253357619047165, | |
| "rewards/accuracy_reward": 0.267857164144516, | |
| "rewards/format_reward": 0.9464285969734192, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 115.51786422729492, | |
| "epoch": 0.08362369337979095, | |
| "grad_norm": 0.6601378321647644, | |
| "kl": 0.02496337890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.1071429252624512, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.10714286006987095, | |
| "rewards/format_reward": 1.0, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 121.14286041259766, | |
| "epoch": 0.08536585365853659, | |
| "grad_norm": 1.0855175256729126, | |
| "kl": 0.025146484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.2142857313156128, | |
| "reward_std": 0.0824786126613617, | |
| "rewards/accuracy_reward": 0.2142857313156128, | |
| "rewards/format_reward": 1.0, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 122.62500762939453, | |
| "epoch": 0.08710801393728224, | |
| "grad_norm": 1.4344055652618408, | |
| "kl": 0.0289306640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.0178571939468384, | |
| "reward_std": 0.0357142873108387, | |
| "rewards/accuracy_reward": 0.01785714365541935, | |
| "rewards/format_reward": 1.0, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 157.75000762939453, | |
| "epoch": 0.08885017421602788, | |
| "grad_norm": 0.6275267004966736, | |
| "kl": 0.016571044921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.3571429252624512, | |
| "reward_std": 0.1428571529686451, | |
| "rewards/accuracy_reward": 0.3571428805589676, | |
| "rewards/format_reward": 1.0, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 191.50000762939453, | |
| "epoch": 0.09059233449477352, | |
| "grad_norm": 0.9530377984046936, | |
| "kl": 0.0240478515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.1785714626312256, | |
| "reward_std": 0.18409645557403564, | |
| "rewards/accuracy_reward": 0.19642857648432255, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 119.6785774230957, | |
| "epoch": 0.09233449477351917, | |
| "grad_norm": 1.568297266960144, | |
| "kl": 0.01593017578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.4821429252624512, | |
| "reward_std": 0.1785714402794838, | |
| "rewards/accuracy_reward": 0.4821428954601288, | |
| "rewards/format_reward": 1.0, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 192.0357208251953, | |
| "epoch": 0.09407665505226481, | |
| "grad_norm": 1.9954557418823242, | |
| "kl": 0.056884765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.4285714626312256, | |
| "reward_std": 0.2253357619047165, | |
| "rewards/accuracy_reward": 0.446428582072258, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 186.9821548461914, | |
| "epoch": 0.09581881533101046, | |
| "grad_norm": 3.1780290603637695, | |
| "kl": 0.0447998046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.2678571939468384, | |
| "reward_std": 0.25248411297798157, | |
| "rewards/accuracy_reward": 0.321428582072258, | |
| "rewards/format_reward": 0.9464285969734192, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 214.80358123779297, | |
| "epoch": 0.0975609756097561, | |
| "grad_norm": 1.200026273727417, | |
| "kl": 0.0296630859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.285714328289032, | |
| "reward_std": 0.2363857999444008, | |
| "rewards/accuracy_reward": 0.3035714477300644, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 141.85714721679688, | |
| "epoch": 0.09930313588850175, | |
| "grad_norm": 1.5381468534469604, | |
| "kl": 0.04248046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.3035715222358704, | |
| "reward_std": 0.14838216453790665, | |
| "rewards/accuracy_reward": 0.3035714477300644, | |
| "rewards/format_reward": 1.0, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 89.01786041259766, | |
| "epoch": 0.10104529616724739, | |
| "grad_norm": 3.5556106567382812, | |
| "kl": 0.03363037109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.4285714626312256, | |
| "reward_std": 0.2253357470035553, | |
| "rewards/accuracy_reward": 0.4285714477300644, | |
| "rewards/format_reward": 1.0, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 94.07143020629883, | |
| "epoch": 0.10278745644599303, | |
| "grad_norm": 0.8911753296852112, | |
| "kl": 0.0328369140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.2142857909202576, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.2321428656578064, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 113.00000762939453, | |
| "epoch": 0.10452961672473868, | |
| "grad_norm": 2.696057081222534, | |
| "kl": 0.0401611328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.160714328289032, | |
| "reward_std": 0.0357142873108387, | |
| "rewards/accuracy_reward": 0.160714291036129, | |
| "rewards/format_reward": 1.0, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 143.46429061889648, | |
| "epoch": 0.10627177700348432, | |
| "grad_norm": 2.277161121368408, | |
| "kl": 0.0289306640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.196428656578064, | |
| "reward_std": 0.1896214708685875, | |
| "rewards/accuracy_reward": 0.2142857201397419, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 41.250000953674316, | |
| "epoch": 0.10801393728222997, | |
| "grad_norm": 0.5966669321060181, | |
| "kl": 0.041748046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.410714328289032, | |
| "reward_std": 0.07695358991622925, | |
| "rewards/accuracy_reward": 0.4107143133878708, | |
| "rewards/format_reward": 1.0, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 106.0535774230957, | |
| "epoch": 0.10975609756097561, | |
| "grad_norm": 1.0405926704406738, | |
| "kl": 0.03521728515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.3214285969734192, | |
| "reward_std": 0.1428571529686451, | |
| "rewards/accuracy_reward": 0.321428582072258, | |
| "rewards/format_reward": 1.0, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 100.30357551574707, | |
| "epoch": 0.11149825783972125, | |
| "grad_norm": 2.6442556381225586, | |
| "kl": 0.043701171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.2142857313156128, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.2142857164144516, | |
| "rewards/format_reward": 1.0, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 127.28572463989258, | |
| "epoch": 0.1132404181184669, | |
| "grad_norm": 1.4323869943618774, | |
| "kl": 0.0272216796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.160714328289032, | |
| "reward_std": 0.13527478277683258, | |
| "rewards/accuracy_reward": 0.1964285746216774, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 91.3750057220459, | |
| "epoch": 0.11498257839721254, | |
| "grad_norm": 1.4564169645309448, | |
| "kl": 0.0450439453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.2500000596046448, | |
| "reward_std": 0.1539071835577488, | |
| "rewards/accuracy_reward": 0.267857164144516, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 110.78572082519531, | |
| "epoch": 0.11672473867595819, | |
| "grad_norm": 3.4178943634033203, | |
| "kl": 0.03350830078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.3392857909202576, | |
| "reward_std": 0.1785714402794838, | |
| "rewards/accuracy_reward": 0.3392857313156128, | |
| "rewards/format_reward": 1.0, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 54.58928871154785, | |
| "epoch": 0.11846689895470383, | |
| "grad_norm": 2.3461058139801025, | |
| "kl": 0.0557861328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.160714328289032, | |
| "reward_std": 0.1071428619325161, | |
| "rewards/accuracy_reward": 0.1607142984867096, | |
| "rewards/format_reward": 1.0, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 198.821439743042, | |
| "epoch": 0.12020905923344948, | |
| "grad_norm": 1.9061115980148315, | |
| "kl": 0.0479736328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.0892857909202576, | |
| "reward_std": 0.16546404361724854, | |
| "rewards/accuracy_reward": 0.1428571492433548, | |
| "rewards/format_reward": 0.9464285969734192, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 59.28571701049805, | |
| "epoch": 0.12195121951219512, | |
| "grad_norm": 2.6947081089019775, | |
| "kl": 0.0968017578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0039, | |
| "reward": 1.1607143878936768, | |
| "reward_std": 0.1071428619325161, | |
| "rewards/accuracy_reward": 0.16071429289877415, | |
| "rewards/format_reward": 1.0, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 46.14285850524902, | |
| "epoch": 0.12369337979094076, | |
| "grad_norm": 3.665738582611084, | |
| "kl": 0.0404052734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.25, | |
| "reward_std": 0.18409645557403564, | |
| "rewards/accuracy_reward": 0.2678571492433548, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 49.37500190734863, | |
| "epoch": 0.1254355400696864, | |
| "grad_norm": 3.0653085708618164, | |
| "kl": 0.02880859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.2142857313156128, | |
| "reward_std": 0.11266788095235825, | |
| "rewards/accuracy_reward": 0.2142857313156128, | |
| "rewards/format_reward": 1.0, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 127.98215103149414, | |
| "epoch": 0.12717770034843207, | |
| "grad_norm": 0.9887677431106567, | |
| "kl": 0.03167724609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.2142857909202576, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.232142873108387, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 48.66071701049805, | |
| "epoch": 0.1289198606271777, | |
| "grad_norm": 0.8168498873710632, | |
| "kl": 0.03155517578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.160714328289032, | |
| "reward_std": 0.0357142873108387, | |
| "rewards/accuracy_reward": 0.160714291036129, | |
| "rewards/format_reward": 1.0, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 133.78571701049805, | |
| "epoch": 0.13066202090592335, | |
| "grad_norm": 0.4777332544326782, | |
| "kl": 0.03472900390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.1428571939468384, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.160714291036129, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 92.28572082519531, | |
| "epoch": 0.13240418118466898, | |
| "grad_norm": 1.1240160465240479, | |
| "kl": 0.03271484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.2678571939468384, | |
| "reward_std": 0.14838216453790665, | |
| "rewards/accuracy_reward": 0.2857142984867096, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 116.6785774230957, | |
| "epoch": 0.13414634146341464, | |
| "grad_norm": 2.4515292644500732, | |
| "kl": 0.044189453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.2678571939468384, | |
| "reward_std": 0.14838216453790665, | |
| "rewards/accuracy_reward": 0.2857142984867096, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 72.44643020629883, | |
| "epoch": 0.13588850174216027, | |
| "grad_norm": 2.802521228790283, | |
| "kl": 0.0426025390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.25, | |
| "reward_std": 0.1539071798324585, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 105.0000057220459, | |
| "epoch": 0.13763066202090593, | |
| "grad_norm": 2.8100438117980957, | |
| "kl": 0.03338623046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.285714328289032, | |
| "reward_std": 0.24695907905697823, | |
| "rewards/accuracy_reward": 0.3035714477300644, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 52.67857360839844, | |
| "epoch": 0.13937282229965156, | |
| "grad_norm": 2.374569892883301, | |
| "kl": 0.0562744140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.3928571939468384, | |
| "reward_std": 0.18409645557403564, | |
| "rewards/accuracy_reward": 0.392857164144516, | |
| "rewards/format_reward": 1.0, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 40.535715103149414, | |
| "epoch": 0.14111498257839722, | |
| "grad_norm": 1.0314127206802368, | |
| "kl": 0.0440673828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.3571429252624512, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.35714287497103214, | |
| "rewards/format_reward": 1.0, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 81.91071701049805, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.6449408531188965, | |
| "kl": 0.03765869140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.3214285969734192, | |
| "reward_std": 0.04123930633068085, | |
| "rewards/accuracy_reward": 0.3214285969734192, | |
| "rewards/format_reward": 1.0, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 111.01786422729492, | |
| "epoch": 0.1445993031358885, | |
| "grad_norm": 4.188468933105469, | |
| "kl": 0.033203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.321428656578064, | |
| "reward_std": 0.2142857238650322, | |
| "rewards/accuracy_reward": 0.3392857313156128, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 98.83929061889648, | |
| "epoch": 0.14634146341463414, | |
| "grad_norm": 2.2457661628723145, | |
| "kl": 0.03643798828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.410714328289032, | |
| "reward_std": 0.1896214708685875, | |
| "rewards/accuracy_reward": 0.4285714477300644, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 76.16071701049805, | |
| "epoch": 0.1480836236933798, | |
| "grad_norm": 1.101365566253662, | |
| "kl": 0.0364990234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.5535714626312256, | |
| "reward_std": 0.07695358991622925, | |
| "rewards/accuracy_reward": 0.5535714626312256, | |
| "rewards/format_reward": 1.0, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 97.35714721679688, | |
| "epoch": 0.14982578397212543, | |
| "grad_norm": 3.1208243370056152, | |
| "kl": 0.038818359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.2678571939468384, | |
| "reward_std": 0.1785714402794838, | |
| "rewards/accuracy_reward": 0.2857142984867096, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 42.07143020629883, | |
| "epoch": 0.15156794425087108, | |
| "grad_norm": 0.7317284941673279, | |
| "kl": 0.0546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.0714285969734192, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0714285746216774, | |
| "rewards/format_reward": 1.0, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 134.0178623199463, | |
| "epoch": 0.15331010452961671, | |
| "grad_norm": 2.284250259399414, | |
| "kl": 0.038818359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.1785714626312256, | |
| "reward_std": 0.17098908126354218, | |
| "rewards/accuracy_reward": 0.2142857238650322, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 71.32143020629883, | |
| "epoch": 0.15505226480836237, | |
| "grad_norm": 2.4166319370269775, | |
| "kl": 0.0408935546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.285714328289032, | |
| "reward_std": 0.18409644439816475, | |
| "rewards/accuracy_reward": 0.2857142984867096, | |
| "rewards/format_reward": 1.0, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 49.76785850524902, | |
| "epoch": 0.156794425087108, | |
| "grad_norm": 0.20406803488731384, | |
| "kl": 0.041015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.0714285969734192, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0714285746216774, | |
| "rewards/format_reward": 1.0, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 41.28571701049805, | |
| "epoch": 0.15853658536585366, | |
| "grad_norm": 1.4971178770065308, | |
| "kl": 0.0352783203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.6250001192092896, | |
| "reward_std": 0.07695358991622925, | |
| "rewards/accuracy_reward": 0.6250000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 88.53572082519531, | |
| "epoch": 0.1602787456445993, | |
| "grad_norm": 3.6174209117889404, | |
| "kl": 0.03338623046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.4464285969734192, | |
| "reward_std": 0.23086077719926834, | |
| "rewards/accuracy_reward": 0.4464285969734192, | |
| "rewards/format_reward": 1.0, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 88.9464340209961, | |
| "epoch": 0.16202090592334495, | |
| "grad_norm": 0.5585963726043701, | |
| "kl": 0.0479736328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.2678571939468384, | |
| "reward_std": 0.0357142873108387, | |
| "rewards/accuracy_reward": 0.2857142984867096, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 100.58929061889648, | |
| "epoch": 0.16376306620209058, | |
| "grad_norm": 0.45368415117263794, | |
| "kl": 0.0296630859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.3392857909202576, | |
| "reward_std": 0.1071428619325161, | |
| "rewards/accuracy_reward": 0.3392857313156128, | |
| "rewards/format_reward": 1.0, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 77.55357360839844, | |
| "epoch": 0.16550522648083624, | |
| "grad_norm": 1.1148852109909058, | |
| "kl": 0.03955078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.2500000596046448, | |
| "reward_std": 0.11266787722706795, | |
| "rewards/accuracy_reward": 0.2500000149011612, | |
| "rewards/format_reward": 1.0, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 87.23214721679688, | |
| "epoch": 0.1672473867595819, | |
| "grad_norm": 2.378681182861328, | |
| "kl": 0.0572509765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.3035715222358704, | |
| "reward_std": 0.14838216453790665, | |
| "rewards/accuracy_reward": 0.3035714477300644, | |
| "rewards/format_reward": 1.0, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 78.42857551574707, | |
| "epoch": 0.16898954703832753, | |
| "grad_norm": 2.708867073059082, | |
| "kl": 0.0322265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.1785715222358704, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.17857144214212894, | |
| "rewards/format_reward": 1.0, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 91.42857360839844, | |
| "epoch": 0.17073170731707318, | |
| "grad_norm": 1.7530436515808105, | |
| "kl": 0.04833984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.1964285969734192, | |
| "reward_std": 0.07695358991622925, | |
| "rewards/accuracy_reward": 0.1964285746216774, | |
| "rewards/format_reward": 1.0, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 120.48214721679688, | |
| "epoch": 0.17247386759581881, | |
| "grad_norm": 2.45546293258667, | |
| "kl": 0.03021240234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.5000000596046448, | |
| "reward_std": 0.25552502274513245, | |
| "rewards/accuracy_reward": 0.517857164144516, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 189.16072845458984, | |
| "epoch": 0.17421602787456447, | |
| "grad_norm": 1.9118529558181763, | |
| "kl": 0.0391845703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.196428656578064, | |
| "reward_std": 0.15086627006530762, | |
| "rewards/accuracy_reward": 0.2321428656578064, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 74.30357551574707, | |
| "epoch": 0.1759581881533101, | |
| "grad_norm": 3.851543664932251, | |
| "kl": 0.0513916015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.3214285969734192, | |
| "reward_std": 0.2967643216252327, | |
| "rewards/accuracy_reward": 0.3214285969734192, | |
| "rewards/format_reward": 1.0, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 119.73214721679688, | |
| "epoch": 0.17770034843205576, | |
| "grad_norm": 1.3045148849487305, | |
| "kl": 0.040771484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.3928571939468384, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.392857164144516, | |
| "rewards/format_reward": 1.0, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 97.01786041259766, | |
| "epoch": 0.1794425087108014, | |
| "grad_norm": 1.8413246870040894, | |
| "kl": 0.02679443359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.3392857909202576, | |
| "reward_std": 0.07695358991622925, | |
| "rewards/accuracy_reward": 0.3392857313156128, | |
| "rewards/format_reward": 1.0, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 66.01785850524902, | |
| "epoch": 0.18118466898954705, | |
| "grad_norm": 1.7732882499694824, | |
| "kl": 0.0533447265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.3035714626312256, | |
| "reward_std": 0.14838216826319695, | |
| "rewards/accuracy_reward": 0.3035714402794838, | |
| "rewards/format_reward": 1.0, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 106.39286041259766, | |
| "epoch": 0.18292682926829268, | |
| "grad_norm": 1.3419760465621948, | |
| "kl": 0.026611328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.446428656578064, | |
| "reward_std": 0.14838217198848724, | |
| "rewards/accuracy_reward": 0.446428582072258, | |
| "rewards/format_reward": 1.0, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 158.4464340209961, | |
| "epoch": 0.18466898954703834, | |
| "grad_norm": 1.9505287408828735, | |
| "kl": 0.03173828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.2142857313156128, | |
| "reward_std": 0.1539071798324585, | |
| "rewards/accuracy_reward": 0.2321428656578064, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 92.5714340209961, | |
| "epoch": 0.18641114982578397, | |
| "grad_norm": 2.1574547290802, | |
| "kl": 0.0264892578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.3214285969734192, | |
| "reward_std": 0.11266788095235825, | |
| "rewards/accuracy_reward": 0.3214285969734192, | |
| "rewards/format_reward": 1.0, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 91.50000381469727, | |
| "epoch": 0.18815331010452963, | |
| "grad_norm": 0.6726558804512024, | |
| "kl": 0.03436279296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.321428656578064, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.3392857313156128, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 72.64286041259766, | |
| "epoch": 0.18989547038327526, | |
| "grad_norm": 2.206634521484375, | |
| "kl": 0.0567626953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.4821429252624512, | |
| "reward_std": 0.1181928999722004, | |
| "rewards/accuracy_reward": 0.4821428805589676, | |
| "rewards/format_reward": 1.0, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 156.51786422729492, | |
| "epoch": 0.1916376306620209, | |
| "grad_norm": 2.635490655899048, | |
| "kl": 0.047607421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.4285715222358704, | |
| "reward_std": 0.1649572253227234, | |
| "rewards/accuracy_reward": 0.4285714477300644, | |
| "rewards/format_reward": 1.0, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 72.64286041259766, | |
| "epoch": 0.19337979094076654, | |
| "grad_norm": 3.589315414428711, | |
| "kl": 0.043212890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.3928572535514832, | |
| "reward_std": 0.1428571492433548, | |
| "rewards/accuracy_reward": 0.3928571492433548, | |
| "rewards/format_reward": 1.0, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 139.69643783569336, | |
| "epoch": 0.1951219512195122, | |
| "grad_norm": 0.8147983551025391, | |
| "kl": 0.0255126953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.285714328289032, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.2857142984867096, | |
| "rewards/format_reward": 1.0, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 173.71429443359375, | |
| "epoch": 0.19686411149825783, | |
| "grad_norm": 3.0332417488098145, | |
| "kl": 0.03369140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.0892857313156128, | |
| "reward_std": 0.14838215708732605, | |
| "rewards/accuracy_reward": 0.1071428656578064, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 204.16072845458984, | |
| "epoch": 0.1986062717770035, | |
| "grad_norm": 2.643592357635498, | |
| "kl": 0.1168212890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0047, | |
| "reward": 1.1071429252624512, | |
| "reward_std": 0.16242313012480736, | |
| "rewards/accuracy_reward": 0.1428571492433548, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 110.07143020629883, | |
| "epoch": 0.20034843205574912, | |
| "grad_norm": 0.8254410624504089, | |
| "kl": 0.036376953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.1428571939468384, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.160714291036129, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 95.58929061889648, | |
| "epoch": 0.20209059233449478, | |
| "grad_norm": 1.296014666557312, | |
| "kl": 0.0499267578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.3214285969734192, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.321428582072258, | |
| "rewards/format_reward": 1.0, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 68.51786041259766, | |
| "epoch": 0.2038327526132404, | |
| "grad_norm": 2.4811439514160156, | |
| "kl": 0.0330810546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.3928571939468384, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.392857164144516, | |
| "rewards/format_reward": 1.0, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 96.14286041259766, | |
| "epoch": 0.20557491289198607, | |
| "grad_norm": 0.46663737297058105, | |
| "kl": 0.0428466796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.3928572535514832, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.3928571492433548, | |
| "rewards/format_reward": 1.0, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 83.26786422729492, | |
| "epoch": 0.2073170731707317, | |
| "grad_norm": 1.1690462827682495, | |
| "kl": 0.037353515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.3035715222358704, | |
| "reward_std": 0.0357142873108387, | |
| "rewards/accuracy_reward": 0.3035714477300644, | |
| "rewards/format_reward": 1.0, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 133.21428680419922, | |
| "epoch": 0.20905923344947736, | |
| "grad_norm": 2.5085010528564453, | |
| "kl": 0.05029296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.2142857909202576, | |
| "reward_std": 0.18409644439816475, | |
| "rewards/accuracy_reward": 0.23214287497103214, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 143.7857208251953, | |
| "epoch": 0.21080139372822299, | |
| "grad_norm": 4.637099742889404, | |
| "kl": 0.045166015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.285714328289032, | |
| "reward_std": 0.1539071798324585, | |
| "rewards/accuracy_reward": 0.3035714477300644, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 64.19643211364746, | |
| "epoch": 0.21254355400696864, | |
| "grad_norm": 1.975265622138977, | |
| "kl": 0.03314208984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.2321428656578064, | |
| "reward_std": 0.14838216453790665, | |
| "rewards/accuracy_reward": 0.2321428656578064, | |
| "rewards/format_reward": 1.0, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 91.57143211364746, | |
| "epoch": 0.21428571428571427, | |
| "grad_norm": 2.885680913925171, | |
| "kl": 0.072509765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.2500000596046448, | |
| "reward_std": 0.1539071872830391, | |
| "rewards/accuracy_reward": 0.2500000149011612, | |
| "rewards/format_reward": 1.0, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 118.39286041259766, | |
| "epoch": 0.21602787456445993, | |
| "grad_norm": 1.6002825498580933, | |
| "kl": 0.0328369140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.1250000596046448, | |
| "reward_std": 0.1071428619325161, | |
| "rewards/accuracy_reward": 0.160714291036129, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 86.53571891784668, | |
| "epoch": 0.21777003484320556, | |
| "grad_norm": 1.9014854431152344, | |
| "kl": 0.0528564453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.2678571939468384, | |
| "reward_std": 0.14838216453790665, | |
| "rewards/accuracy_reward": 0.267857164144516, | |
| "rewards/format_reward": 1.0, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 76.4464340209961, | |
| "epoch": 0.21951219512195122, | |
| "grad_norm": 1.3376573324203491, | |
| "kl": 0.02325439453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.3571429252624512, | |
| "reward_std": 0.18409645557403564, | |
| "rewards/accuracy_reward": 0.3571428656578064, | |
| "rewards/format_reward": 1.0, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 73.57143020629883, | |
| "epoch": 0.22125435540069685, | |
| "grad_norm": 1.281449794769287, | |
| "kl": 0.0526123046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.0535714626312256, | |
| "reward_std": 0.07695358991622925, | |
| "rewards/accuracy_reward": 0.0535714328289032, | |
| "rewards/format_reward": 1.0, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 75.46429061889648, | |
| "epoch": 0.2229965156794425, | |
| "grad_norm": 1.3834271430969238, | |
| "kl": 0.0509033203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.160714328289032, | |
| "reward_std": 0.0357142873108387, | |
| "rewards/accuracy_reward": 0.160714291036129, | |
| "rewards/format_reward": 1.0, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 83.48214721679688, | |
| "epoch": 0.22473867595818817, | |
| "grad_norm": 3.221709966659546, | |
| "kl": 0.048583984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.2678571939468384, | |
| "reward_std": 0.29123931378126144, | |
| "rewards/accuracy_reward": 0.2857142984867096, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 82.48214530944824, | |
| "epoch": 0.2264808362369338, | |
| "grad_norm": 2.775517463684082, | |
| "kl": 0.046630859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.4642857313156128, | |
| "reward_std": 0.11266787722706795, | |
| "rewards/accuracy_reward": 0.4642857313156128, | |
| "rewards/format_reward": 1.0, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 106.03571701049805, | |
| "epoch": 0.22822299651567945, | |
| "grad_norm": 0.7142723798751831, | |
| "kl": 0.0277099609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.2500000596046448, | |
| "reward_std": 0.11266787722706795, | |
| "rewards/accuracy_reward": 0.2500000149011612, | |
| "rewards/format_reward": 1.0, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 54.875003814697266, | |
| "epoch": 0.22996515679442509, | |
| "grad_norm": 1.956363320350647, | |
| "kl": 0.0419921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.2321428656578064, | |
| "reward_std": 0.07695359364151955, | |
| "rewards/accuracy_reward": 0.2321428656578064, | |
| "rewards/format_reward": 1.0, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 133.0357208251953, | |
| "epoch": 0.23170731707317074, | |
| "grad_norm": 2.0702030658721924, | |
| "kl": 0.0458984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.3035714626312256, | |
| "reward_std": 0.2610500380396843, | |
| "rewards/accuracy_reward": 0.3392857313156128, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 123.44643783569336, | |
| "epoch": 0.23344947735191637, | |
| "grad_norm": 3.096449136734009, | |
| "kl": 0.0501708984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.2678571939468384, | |
| "reward_std": 0.14838217198848724, | |
| "rewards/accuracy_reward": 0.267857164144516, | |
| "rewards/format_reward": 1.0, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 129.83929061889648, | |
| "epoch": 0.23519163763066203, | |
| "grad_norm": 3.0806472301483154, | |
| "kl": 0.03887939453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.3392857909202576, | |
| "reward_std": 0.1785714365541935, | |
| "rewards/accuracy_reward": 0.3571428805589676, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 134.85715103149414, | |
| "epoch": 0.23693379790940766, | |
| "grad_norm": 2.5985164642333984, | |
| "kl": 0.04425048828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.4285714626312256, | |
| "reward_std": 0.25552502274513245, | |
| "rewards/accuracy_reward": 0.4285714328289032, | |
| "rewards/format_reward": 1.0, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 122.83929061889648, | |
| "epoch": 0.23867595818815332, | |
| "grad_norm": 0.6726248860359192, | |
| "kl": 0.06396484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.3571429252624512, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.3571428805589676, | |
| "rewards/format_reward": 1.0, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 93.76786041259766, | |
| "epoch": 0.24041811846689895, | |
| "grad_norm": 1.511077880859375, | |
| "kl": 0.02423095703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.2857143878936768, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.2857142984867096, | |
| "rewards/format_reward": 1.0, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 98.94643020629883, | |
| "epoch": 0.2421602787456446, | |
| "grad_norm": 1.0339641571044922, | |
| "kl": 0.0565185546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.071428656578064, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.07142857648432255, | |
| "rewards/format_reward": 1.0, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 124.57143592834473, | |
| "epoch": 0.24390243902439024, | |
| "grad_norm": 1.6461817026138306, | |
| "kl": 0.0247802734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.3750000596046448, | |
| "reward_std": 0.21981074661016464, | |
| "rewards/accuracy_reward": 0.392857164144516, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 109.5714340209961, | |
| "epoch": 0.2456445993031359, | |
| "grad_norm": 1.4544490575790405, | |
| "kl": 0.042236328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.2321429252624512, | |
| "reward_std": 0.1071428656578064, | |
| "rewards/accuracy_reward": 0.2500000149011612, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 154.23215293884277, | |
| "epoch": 0.24738675958188153, | |
| "grad_norm": 4.0668230056762695, | |
| "kl": 0.02862548828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.196428656578064, | |
| "reward_std": 0.28061603009700775, | |
| "rewards/accuracy_reward": 0.2321428656578064, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 110.10715103149414, | |
| "epoch": 0.24912891986062718, | |
| "grad_norm": 3.707798957824707, | |
| "kl": 0.0501708984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.4107143878936768, | |
| "reward_std": 0.1181928962469101, | |
| "rewards/accuracy_reward": 0.4107142984867096, | |
| "rewards/format_reward": 1.0, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 151.73214721679688, | |
| "epoch": 0.2508710801393728, | |
| "grad_norm": 1.6972342729568481, | |
| "kl": 0.041259765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.2500000596046448, | |
| "reward_std": 0.20117833092808723, | |
| "rewards/accuracy_reward": 0.2857142984867096, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 101.66072082519531, | |
| "epoch": 0.25261324041811845, | |
| "grad_norm": 3.099278211593628, | |
| "kl": 0.141845703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0057, | |
| "reward": 1.1785714626312256, | |
| "reward_std": 0.1428571529686451, | |
| "rewards/accuracy_reward": 0.1785714328289032, | |
| "rewards/format_reward": 1.0, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 211.41071701049805, | |
| "epoch": 0.25435540069686413, | |
| "grad_norm": 1.7294187545776367, | |
| "kl": 0.031005859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.2321429252624512, | |
| "reward_std": 0.1896214783191681, | |
| "rewards/accuracy_reward": 0.2857143059372902, | |
| "rewards/format_reward": 0.9464285969734192, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 53.32143020629883, | |
| "epoch": 0.25609756097560976, | |
| "grad_norm": 2.51180100440979, | |
| "kl": 0.032470703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.196428656578064, | |
| "reward_std": 0.1071428619325161, | |
| "rewards/accuracy_reward": 0.2142857238650322, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 144.10715103149414, | |
| "epoch": 0.2578397212543554, | |
| "grad_norm": 1.118396282196045, | |
| "kl": 0.03857421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.160714328289032, | |
| "reward_std": 0.14838217198848724, | |
| "rewards/accuracy_reward": 0.2142857164144516, | |
| "rewards/format_reward": 0.9464285969734192, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 209.16072845458984, | |
| "epoch": 0.259581881533101, | |
| "grad_norm": 0.5762439966201782, | |
| "kl": 0.023193359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.1785714626312256, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.2142857238650322, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 180.80358123779297, | |
| "epoch": 0.2613240418118467, | |
| "grad_norm": 1.6064391136169434, | |
| "kl": 0.064453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.1964285969734192, | |
| "reward_std": 0.13527477905154228, | |
| "rewards/accuracy_reward": 0.2142857164144516, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 140.00000762939453, | |
| "epoch": 0.26306620209059234, | |
| "grad_norm": 2.3106741905212402, | |
| "kl": 0.0313720703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.3750000596046448, | |
| "reward_std": 0.07695359364151955, | |
| "rewards/accuracy_reward": 0.392857164144516, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 137.25000381469727, | |
| "epoch": 0.26480836236933797, | |
| "grad_norm": 1.3025450706481934, | |
| "kl": 0.02471923828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.1785714626312256, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.1964285746216774, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 139.60715103149414, | |
| "epoch": 0.2665505226480836, | |
| "grad_norm": 2.006934881210327, | |
| "kl": 0.039306640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.2321429252624512, | |
| "reward_std": 0.1896214671432972, | |
| "rewards/accuracy_reward": 0.2321428656578064, | |
| "rewards/format_reward": 1.0, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 146.92857360839844, | |
| "epoch": 0.2682926829268293, | |
| "grad_norm": 2.0233981609344482, | |
| "kl": 0.0511474609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.3750000596046448, | |
| "reward_std": 0.2610500380396843, | |
| "rewards/accuracy_reward": 0.3928571715950966, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 135.76786422729492, | |
| "epoch": 0.2700348432055749, | |
| "grad_norm": 2.7513368129730225, | |
| "kl": 0.03076171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.2321429252624512, | |
| "reward_std": 0.1181928962469101, | |
| "rewards/accuracy_reward": 0.232142873108387, | |
| "rewards/format_reward": 1.0, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 135.73214721679688, | |
| "epoch": 0.27177700348432055, | |
| "grad_norm": 2.0850443840026855, | |
| "kl": 0.0506591796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.196428656578064, | |
| "reward_std": 0.1071428656578064, | |
| "rewards/accuracy_reward": 0.2142857238650322, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 153.05358123779297, | |
| "epoch": 0.2735191637630662, | |
| "grad_norm": 2.3688433170318604, | |
| "kl": 0.022216796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.6250000596046448, | |
| "reward_std": 0.14838216826319695, | |
| "rewards/accuracy_reward": 0.6428571939468384, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 205.92858123779297, | |
| "epoch": 0.27526132404181186, | |
| "grad_norm": 3.015334129333496, | |
| "kl": 0.03314208984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.2500000596046448, | |
| "reward_std": 0.26657508313655853, | |
| "rewards/accuracy_reward": 0.2857142984867096, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 167.4107208251953, | |
| "epoch": 0.2770034843205575, | |
| "grad_norm": 1.3877893686294556, | |
| "kl": 0.035400390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.1250000596046448, | |
| "reward_std": 0.1071428619325161, | |
| "rewards/accuracy_reward": 0.1428571492433548, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 135.0357208251953, | |
| "epoch": 0.2787456445993031, | |
| "grad_norm": 2.2060704231262207, | |
| "kl": 0.030029296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.285714328289032, | |
| "reward_std": 0.24241763353347778, | |
| "rewards/accuracy_reward": 0.3035714477300644, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 185.50000762939453, | |
| "epoch": 0.2804878048780488, | |
| "grad_norm": 0.6886353492736816, | |
| "kl": 0.032958984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.160714328289032, | |
| "reward_std": 0.1071428619325161, | |
| "rewards/accuracy_reward": 0.1785714328289032, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 84.0535774230957, | |
| "epoch": 0.28222996515679444, | |
| "grad_norm": 1.9014931917190552, | |
| "kl": 0.0433349609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.3571429252624512, | |
| "reward_std": 0.18409645557403564, | |
| "rewards/accuracy_reward": 0.3571428805589676, | |
| "rewards/format_reward": 1.0, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 95.14286041259766, | |
| "epoch": 0.28397212543554007, | |
| "grad_norm": 5.170721054077148, | |
| "kl": 0.0498046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.4821429252624512, | |
| "reward_std": 0.1071428656578064, | |
| "rewards/accuracy_reward": 0.4821428805589676, | |
| "rewards/format_reward": 1.0, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 120.8214340209961, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 4.6296796798706055, | |
| "kl": 0.035888671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.0535714626312256, | |
| "reward_std": 0.1071428656578064, | |
| "rewards/accuracy_reward": 0.0714285746216774, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 68.62500190734863, | |
| "epoch": 0.2874564459930314, | |
| "grad_norm": 2.599226236343384, | |
| "kl": 0.0472412109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.3571429252624512, | |
| "reward_std": 0.19514648616313934, | |
| "rewards/accuracy_reward": 0.3571428805589676, | |
| "rewards/format_reward": 1.0, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 114.92858123779297, | |
| "epoch": 0.289198606271777, | |
| "grad_norm": 0.9972745776176453, | |
| "kl": 0.0364990234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.285714328289032, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.3035714477300644, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 132.44643020629883, | |
| "epoch": 0.29094076655052264, | |
| "grad_norm": 3.7945616245269775, | |
| "kl": 0.03155517578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.160714328289032, | |
| "reward_std": 0.1785714402794838, | |
| "rewards/accuracy_reward": 0.1607142984867096, | |
| "rewards/format_reward": 1.0, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 65.51786041259766, | |
| "epoch": 0.2926829268292683, | |
| "grad_norm": 2.3588600158691406, | |
| "kl": 0.0594482421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.3750000596046448, | |
| "reward_std": 0.1181928962469101, | |
| "rewards/accuracy_reward": 0.3750000149011612, | |
| "rewards/format_reward": 1.0, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 97.87500762939453, | |
| "epoch": 0.29442508710801396, | |
| "grad_norm": 1.6221623420715332, | |
| "kl": 0.0416259765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.3392857909202576, | |
| "reward_std": 0.0357142873108387, | |
| "rewards/accuracy_reward": 0.3392857313156128, | |
| "rewards/format_reward": 1.0, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 68.78571510314941, | |
| "epoch": 0.2961672473867596, | |
| "grad_norm": 3.4078965187072754, | |
| "kl": 0.197509765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0079, | |
| "reward": 1.321428656578064, | |
| "reward_std": 0.1428571492433548, | |
| "rewards/accuracy_reward": 0.321428582072258, | |
| "rewards/format_reward": 1.0, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 95.67857360839844, | |
| "epoch": 0.2979094076655052, | |
| "grad_norm": 3.5229647159576416, | |
| "kl": 0.03277587890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.2142857313156128, | |
| "reward_std": 0.1539071835577488, | |
| "rewards/accuracy_reward": 0.2142857164144516, | |
| "rewards/format_reward": 1.0, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 92.17857360839844, | |
| "epoch": 0.29965156794425085, | |
| "grad_norm": 1.6305819749832153, | |
| "kl": 0.0592041015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.25, | |
| "reward_std": 0.1539071798324585, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 1.0, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 134.91072273254395, | |
| "epoch": 0.30139372822299654, | |
| "grad_norm": 0.8862522840499878, | |
| "kl": 0.03143310546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.3928571939468384, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.392857164144516, | |
| "rewards/format_reward": 1.0, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 91.64286041259766, | |
| "epoch": 0.30313588850174217, | |
| "grad_norm": 2.307363748550415, | |
| "kl": 0.04156494140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.1785715222358704, | |
| "reward_std": 0.18409645557403564, | |
| "rewards/accuracy_reward": 0.1785714365541935, | |
| "rewards/format_reward": 1.0, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 100.71429061889648, | |
| "epoch": 0.3048780487804878, | |
| "grad_norm": 4.026115417480469, | |
| "kl": 0.03656005859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.410714328289032, | |
| "reward_std": 0.1785714402794838, | |
| "rewards/accuracy_reward": 0.4285714328289032, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 122.00000762939453, | |
| "epoch": 0.30662020905923343, | |
| "grad_norm": 1.8479260206222534, | |
| "kl": 0.04150390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.446428656578064, | |
| "reward_std": 0.1071428619325161, | |
| "rewards/accuracy_reward": 0.4464285969734192, | |
| "rewards/format_reward": 1.0, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 99.00000381469727, | |
| "epoch": 0.3083623693379791, | |
| "grad_norm": 2.3721132278442383, | |
| "kl": 0.0333251953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.3571428656578064, | |
| "reward_std": 0.11266787722706795, | |
| "rewards/accuracy_reward": 0.3571428656578064, | |
| "rewards/format_reward": 1.0, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 132.9285774230957, | |
| "epoch": 0.31010452961672474, | |
| "grad_norm": 0.8678131103515625, | |
| "kl": 0.03631591796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.3214285969734192, | |
| "reward_std": 0.18409644439816475, | |
| "rewards/accuracy_reward": 0.321428582072258, | |
| "rewards/format_reward": 1.0, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 112.21428680419922, | |
| "epoch": 0.3118466898954704, | |
| "grad_norm": 2.4448330402374268, | |
| "kl": 0.043212890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.160714328289032, | |
| "reward_std": 0.1181928962469101, | |
| "rewards/accuracy_reward": 0.1607142984867096, | |
| "rewards/format_reward": 1.0, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 134.64286041259766, | |
| "epoch": 0.313588850174216, | |
| "grad_norm": 1.2125186920166016, | |
| "kl": 0.0335693359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.2500000596046448, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.2500000149011612, | |
| "rewards/format_reward": 1.0, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 174.6428680419922, | |
| "epoch": 0.3153310104529617, | |
| "grad_norm": 1.7917309999465942, | |
| "kl": 0.025146484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.2321429252624512, | |
| "reward_std": 0.14838216826319695, | |
| "rewards/accuracy_reward": 0.2500000149011612, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 203.71429443359375, | |
| "epoch": 0.3170731707317073, | |
| "grad_norm": 3.1097638607025146, | |
| "kl": 0.03955078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.3392857909202576, | |
| "reward_std": 0.18105553090572357, | |
| "rewards/accuracy_reward": 0.3571428805589676, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 145.0357208251953, | |
| "epoch": 0.31881533101045295, | |
| "grad_norm": 0.9235899448394775, | |
| "kl": 0.0269775390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.3035715222358704, | |
| "reward_std": 0.1785714402794838, | |
| "rewards/accuracy_reward": 0.3035714477300644, | |
| "rewards/format_reward": 1.0, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 143.55358123779297, | |
| "epoch": 0.3205574912891986, | |
| "grad_norm": 0.8848127126693726, | |
| "kl": 0.02520751953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.2500000596046448, | |
| "reward_std": 0.1428571529686451, | |
| "rewards/accuracy_reward": 0.25000000931322575, | |
| "rewards/format_reward": 1.0, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 165.62500762939453, | |
| "epoch": 0.32229965156794427, | |
| "grad_norm": 1.5377029180526733, | |
| "kl": 0.017181396484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.2321428656578064, | |
| "reward_std": 0.14838216453790665, | |
| "rewards/accuracy_reward": 0.2321428656578064, | |
| "rewards/format_reward": 1.0, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 156.3928680419922, | |
| "epoch": 0.3240418118466899, | |
| "grad_norm": 1.4587819576263428, | |
| "kl": 0.02685546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.3928571939468384, | |
| "reward_std": 0.19514650478959084, | |
| "rewards/accuracy_reward": 0.392857164144516, | |
| "rewards/format_reward": 1.0, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 216.4107208251953, | |
| "epoch": 0.32578397212543553, | |
| "grad_norm": 2.4346601963043213, | |
| "kl": 0.018524169921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.410714328289032, | |
| "reward_std": 0.14838216453790665, | |
| "rewards/accuracy_reward": 0.4107143133878708, | |
| "rewards/format_reward": 1.0, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 161.73215103149414, | |
| "epoch": 0.32752613240418116, | |
| "grad_norm": 0.7127730250358582, | |
| "kl": 0.02191162109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.2321429252624512, | |
| "reward_std": 0.1071428619325161, | |
| "rewards/accuracy_reward": 0.2321428693830967, | |
| "rewards/format_reward": 1.0, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 217.33929443359375, | |
| "epoch": 0.32926829268292684, | |
| "grad_norm": 1.9412953853607178, | |
| "kl": 0.012176513671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0005, | |
| "reward": 1.2321429252624512, | |
| "reward_std": 0.1785714402794838, | |
| "rewards/accuracy_reward": 0.2321428656578064, | |
| "rewards/format_reward": 1.0, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 315.4464416503906, | |
| "epoch": 0.3310104529616725, | |
| "grad_norm": 0.8375619053840637, | |
| "kl": 0.01995849609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.2321429252624512, | |
| "reward_std": 0.21124479547142982, | |
| "rewards/accuracy_reward": 0.2500000149011612, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 203.0178680419922, | |
| "epoch": 0.3327526132404181, | |
| "grad_norm": 2.4890501499176025, | |
| "kl": 0.0247802734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.2857143878936768, | |
| "reward_std": 0.18409645557403564, | |
| "rewards/accuracy_reward": 0.2857142984867096, | |
| "rewards/format_reward": 1.0, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 217.75000762939453, | |
| "epoch": 0.3344947735191638, | |
| "grad_norm": 0.957855761051178, | |
| "kl": 0.014923095703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.5714285969734192, | |
| "reward_std": 0.25552502274513245, | |
| "rewards/accuracy_reward": 0.5714285969734192, | |
| "rewards/format_reward": 1.0, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 347.08929443359375, | |
| "epoch": 0.3362369337979094, | |
| "grad_norm": 0.6019410490989685, | |
| "kl": 0.0164794921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.3035714626312256, | |
| "reward_std": 0.18105553090572357, | |
| "rewards/accuracy_reward": 0.321428582072258, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 268.8214416503906, | |
| "epoch": 0.33797909407665505, | |
| "grad_norm": 0.979367733001709, | |
| "kl": 0.02947998046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.3928571939468384, | |
| "reward_std": 0.26657505333423615, | |
| "rewards/accuracy_reward": 0.4107143133878708, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 332.1785888671875, | |
| "epoch": 0.3397212543554007, | |
| "grad_norm": 1.4663631916046143, | |
| "kl": 0.0245361328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.3214285969734192, | |
| "reward_std": 0.3571428656578064, | |
| "rewards/accuracy_reward": 0.392857164144516, | |
| "rewards/format_reward": 0.9285714626312256, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 389.5357360839844, | |
| "epoch": 0.34146341463414637, | |
| "grad_norm": 0.43615275621414185, | |
| "kl": 0.016143798828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.1250000596046448, | |
| "reward_std": 0.1785714402794838, | |
| "rewards/accuracy_reward": 0.1785714402794838, | |
| "rewards/format_reward": 0.9464285969734192, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 267.7678680419922, | |
| "epoch": 0.343205574912892, | |
| "grad_norm": 2.834510564804077, | |
| "kl": 0.0343017578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.3392857313156128, | |
| "reward_std": 0.14838216826319695, | |
| "rewards/accuracy_reward": 0.3571428656578064, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 261.1607208251953, | |
| "epoch": 0.34494773519163763, | |
| "grad_norm": 1.6161222457885742, | |
| "kl": 0.0169677734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.5000000596046448, | |
| "reward_std": 0.26657505333423615, | |
| "rewards/accuracy_reward": 0.517857164144516, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 414.0357360839844, | |
| "epoch": 0.34668989547038326, | |
| "grad_norm": 1.6385655403137207, | |
| "kl": 0.0218505859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.410714328289032, | |
| "reward_std": 0.29123930633068085, | |
| "rewards/accuracy_reward": 0.464285746216774, | |
| "rewards/format_reward": 0.9464285969734192, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 352.7678680419922, | |
| "epoch": 0.34843205574912894, | |
| "grad_norm": 1.2244716882705688, | |
| "kl": 0.01873779296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.071428656578064, | |
| "reward_std": 0.24241764843463898, | |
| "rewards/accuracy_reward": 0.1250000074505806, | |
| "rewards/format_reward": 0.9464285969734192, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 327.78572845458984, | |
| "epoch": 0.3501742160278746, | |
| "grad_norm": 0.8990840315818787, | |
| "kl": 0.02777099609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.1964285969734192, | |
| "reward_std": 0.23086076974868774, | |
| "rewards/accuracy_reward": 0.2500000149011612, | |
| "rewards/format_reward": 0.9464285969734192, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 249.0535888671875, | |
| "epoch": 0.3519163763066202, | |
| "grad_norm": 0.8288552165031433, | |
| "kl": 0.026885986328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.3035715222358704, | |
| "reward_std": 0.1071428619325161, | |
| "rewards/accuracy_reward": 0.3035714477300644, | |
| "rewards/format_reward": 1.0, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 291.83929443359375, | |
| "epoch": 0.35365853658536583, | |
| "grad_norm": 0.9214010238647461, | |
| "kl": 0.0103759765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.3750000596046448, | |
| "reward_std": 0.2891819700598717, | |
| "rewards/accuracy_reward": 0.3928571492433548, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 197.3928680419922, | |
| "epoch": 0.3554006968641115, | |
| "grad_norm": 1.698442816734314, | |
| "kl": 0.0186767578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.321428656578064, | |
| "reward_std": 0.1539071798324585, | |
| "rewards/accuracy_reward": 0.3392857313156128, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 191.39286041259766, | |
| "epoch": 0.35714285714285715, | |
| "grad_norm": 1.9810231924057007, | |
| "kl": 0.0213623046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.4464285969734192, | |
| "reward_std": 0.23086077719926834, | |
| "rewards/accuracy_reward": 0.464285746216774, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 208.1071548461914, | |
| "epoch": 0.3588850174216028, | |
| "grad_norm": 1.2316064834594727, | |
| "kl": 0.0518798828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.0892857909202576, | |
| "reward_std": 0.14838216826319695, | |
| "rewards/accuracy_reward": 0.08928571827709675, | |
| "rewards/format_reward": 1.0, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 213.3571548461914, | |
| "epoch": 0.3606271777003484, | |
| "grad_norm": 0.23318666219711304, | |
| "kl": 0.02264404296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.2500000596046448, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.2678571492433548, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 224.3571548461914, | |
| "epoch": 0.3623693379790941, | |
| "grad_norm": 1.5259240865707397, | |
| "kl": 0.01904296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.3571429252624512, | |
| "reward_std": 0.11266788095235825, | |
| "rewards/accuracy_reward": 0.3571428656578064, | |
| "rewards/format_reward": 1.0, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 217.94644165039062, | |
| "epoch": 0.3641114982578397, | |
| "grad_norm": 1.5131481885910034, | |
| "kl": 0.017578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.3214285969734192, | |
| "reward_std": 0.1428571529686451, | |
| "rewards/accuracy_reward": 0.3392857313156128, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 206.46429443359375, | |
| "epoch": 0.36585365853658536, | |
| "grad_norm": 0.8935233950614929, | |
| "kl": 0.025146484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.1785715222358704, | |
| "reward_std": 0.19514649361371994, | |
| "rewards/accuracy_reward": 0.196428582072258, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 150.60714721679688, | |
| "epoch": 0.367595818815331, | |
| "grad_norm": 1.949971318244934, | |
| "kl": 0.108154296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0043, | |
| "reward": 1.2857143878936768, | |
| "reward_std": 0.18409645557403564, | |
| "rewards/accuracy_reward": 0.3035714477300644, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 233.19644165039062, | |
| "epoch": 0.3693379790940767, | |
| "grad_norm": 1.5521166324615479, | |
| "kl": 0.0205078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.2678571939468384, | |
| "reward_std": 0.14838216826319695, | |
| "rewards/accuracy_reward": 0.2857142984867096, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 115.98214721679688, | |
| "epoch": 0.3710801393728223, | |
| "grad_norm": 2.789050579071045, | |
| "kl": 0.0338134765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.3035715222358704, | |
| "reward_std": 0.1071428656578064, | |
| "rewards/accuracy_reward": 0.3035714477300644, | |
| "rewards/format_reward": 1.0, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 148.96429061889648, | |
| "epoch": 0.37282229965156793, | |
| "grad_norm": 3.145935297012329, | |
| "kl": 0.02362060546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.2678572535514832, | |
| "reward_std": 0.21981074661016464, | |
| "rewards/accuracy_reward": 0.3035714328289032, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 129.25000762939453, | |
| "epoch": 0.37456445993031356, | |
| "grad_norm": 4.017449378967285, | |
| "kl": 0.0260009765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.3392857909202576, | |
| "reward_std": 0.1785714402794838, | |
| "rewards/accuracy_reward": 0.3392857313156128, | |
| "rewards/format_reward": 1.0, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 204.2678680419922, | |
| "epoch": 0.37630662020905925, | |
| "grad_norm": 0.5732056498527527, | |
| "kl": 0.0830078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "reward": 1.2857143878936768, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.2857142984867096, | |
| "rewards/format_reward": 1.0, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 181.7678680419922, | |
| "epoch": 0.3780487804878049, | |
| "grad_norm": 0.7782284617424011, | |
| "kl": 0.068359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.3035714626312256, | |
| "reward_std": 0.07695358991622925, | |
| "rewards/accuracy_reward": 0.3214285746216774, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 142.875, | |
| "epoch": 0.3797909407665505, | |
| "grad_norm": 1.5171810388565063, | |
| "kl": 0.0218505859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.2678571939468384, | |
| "reward_std": 0.07695359364151955, | |
| "rewards/accuracy_reward": 0.267857164144516, | |
| "rewards/format_reward": 1.0, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 174.3214340209961, | |
| "epoch": 0.38153310104529614, | |
| "grad_norm": 1.3412084579467773, | |
| "kl": 0.13818359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0055, | |
| "reward": 1.3392857313156128, | |
| "reward_std": 0.14838216826319695, | |
| "rewards/accuracy_reward": 0.3571428656578064, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 166.33929443359375, | |
| "epoch": 0.3832752613240418, | |
| "grad_norm": 2.023908853530884, | |
| "kl": 0.038818359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.2500000596046448, | |
| "reward_std": 0.18409645557403564, | |
| "rewards/accuracy_reward": 0.267857164144516, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 104.26786422729492, | |
| "epoch": 0.38501742160278746, | |
| "grad_norm": 0.8557510375976562, | |
| "kl": 0.02642822265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.5000000596046448, | |
| "reward_std": 0.11266788095235825, | |
| "rewards/accuracy_reward": 0.5000000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 116.12500762939453, | |
| "epoch": 0.3867595818815331, | |
| "grad_norm": 2.272390365600586, | |
| "kl": 0.0286865234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.321428656578064, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.3214285746216774, | |
| "rewards/format_reward": 1.0, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 134.21429443359375, | |
| "epoch": 0.3885017421602787, | |
| "grad_norm": 2.2785820960998535, | |
| "kl": 0.05206298828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.2321429252624512, | |
| "reward_std": 0.1181928999722004, | |
| "rewards/accuracy_reward": 0.2321428656578064, | |
| "rewards/format_reward": 1.0, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 103.01786041259766, | |
| "epoch": 0.3902439024390244, | |
| "grad_norm": 1.0930382013320923, | |
| "kl": 0.022918701171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.4821429252624512, | |
| "reward_std": 0.14838216453790665, | |
| "rewards/accuracy_reward": 0.4821428805589676, | |
| "rewards/format_reward": 1.0, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 164.50000762939453, | |
| "epoch": 0.39198606271777003, | |
| "grad_norm": 1.6921836137771606, | |
| "kl": 0.02490234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.2500000596046448, | |
| "reward_std": 0.18409644439816475, | |
| "rewards/accuracy_reward": 0.2500000149011612, | |
| "rewards/format_reward": 1.0, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 112.5535774230957, | |
| "epoch": 0.39372822299651566, | |
| "grad_norm": 2.7419986724853516, | |
| "kl": 0.04638671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.3035714626312256, | |
| "reward_std": 0.1896214671432972, | |
| "rewards/accuracy_reward": 0.3035714477300644, | |
| "rewards/format_reward": 1.0, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 127.62500381469727, | |
| "epoch": 0.39547038327526135, | |
| "grad_norm": 6.331331253051758, | |
| "kl": 0.03118896484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.285714328289032, | |
| "reward_std": 0.18409645557403564, | |
| "rewards/accuracy_reward": 0.3035714477300644, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 172.58929443359375, | |
| "epoch": 0.397212543554007, | |
| "grad_norm": 2.7327375411987305, | |
| "kl": 0.03802490234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.3928571939468384, | |
| "reward_std": 0.2253357470035553, | |
| "rewards/accuracy_reward": 0.4107142984867096, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 151.58929443359375, | |
| "epoch": 0.3989547038327526, | |
| "grad_norm": 0.866677463054657, | |
| "kl": 0.01922607421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.5178572535514832, | |
| "reward_std": 0.1181928962469101, | |
| "rewards/accuracy_reward": 0.5178571790456772, | |
| "rewards/format_reward": 1.0, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 175.2857208251953, | |
| "epoch": 0.40069686411149824, | |
| "grad_norm": 1.3600616455078125, | |
| "kl": 0.024658203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.2142857313156128, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.23214287497103214, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 107.00000762939453, | |
| "epoch": 0.4024390243902439, | |
| "grad_norm": 3.2532806396484375, | |
| "kl": 0.04315185546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.5178571939468384, | |
| "reward_std": 0.4149572402238846, | |
| "rewards/accuracy_reward": 0.5357143133878708, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 183.3928680419922, | |
| "epoch": 0.40418118466898956, | |
| "grad_norm": 2.559502601623535, | |
| "kl": 0.03631591796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.321428656578064, | |
| "reward_std": 0.25552502274513245, | |
| "rewards/accuracy_reward": 0.3392857201397419, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 110.0714340209961, | |
| "epoch": 0.4059233449477352, | |
| "grad_norm": 2.014815330505371, | |
| "kl": 0.035888671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.410714328289032, | |
| "reward_std": 0.07695359364151955, | |
| "rewards/accuracy_reward": 0.4107142984867096, | |
| "rewards/format_reward": 1.0, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 164.33929443359375, | |
| "epoch": 0.4076655052264808, | |
| "grad_norm": 1.9719363451004028, | |
| "kl": 0.0830078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "reward": 1.2678571939468384, | |
| "reward_std": 0.2610500529408455, | |
| "rewards/accuracy_reward": 0.2857142984867096, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 167.98214721679688, | |
| "epoch": 0.4094076655052265, | |
| "grad_norm": 1.2324802875518799, | |
| "kl": 0.04840087890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.3750000596046448, | |
| "reward_std": 0.1071428619325161, | |
| "rewards/accuracy_reward": 0.3750000149011612, | |
| "rewards/format_reward": 1.0, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 130.76786041259766, | |
| "epoch": 0.41114982578397213, | |
| "grad_norm": 2.3901517391204834, | |
| "kl": 0.0355224609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.2142857313156128, | |
| "reward_std": 0.1428571529686451, | |
| "rewards/accuracy_reward": 0.2142857164144516, | |
| "rewards/format_reward": 1.0, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 82.35714530944824, | |
| "epoch": 0.41289198606271776, | |
| "grad_norm": 1.5934151411056519, | |
| "kl": 0.06884765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.160714328289032, | |
| "reward_std": 0.0357142873108387, | |
| "rewards/accuracy_reward": 0.160714291036129, | |
| "rewards/format_reward": 1.0, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 101.0535774230957, | |
| "epoch": 0.4146341463414634, | |
| "grad_norm": 1.6057491302490234, | |
| "kl": 0.081787109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "reward": 1.1428571939468384, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.1428571492433548, | |
| "rewards/format_reward": 1.0, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 134.33929443359375, | |
| "epoch": 0.4163763066202091, | |
| "grad_norm": 11.595154762268066, | |
| "kl": 0.03668212890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.3392857909202576, | |
| "reward_std": 0.16546405106782913, | |
| "rewards/accuracy_reward": 0.3750000298023224, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 116.76786422729492, | |
| "epoch": 0.4181184668989547, | |
| "grad_norm": 0.3346155881881714, | |
| "kl": 0.034912109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.0892857909202576, | |
| "reward_std": 0.0357142873108387, | |
| "rewards/accuracy_reward": 0.08928571827709675, | |
| "rewards/format_reward": 1.0, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 99.01786041259766, | |
| "epoch": 0.41986062717770034, | |
| "grad_norm": 1.3594080209732056, | |
| "kl": 0.0355224609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.3035714626312256, | |
| "reward_std": 0.1071428656578064, | |
| "rewards/accuracy_reward": 0.3035714328289032, | |
| "rewards/format_reward": 1.0, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 118.0714340209961, | |
| "epoch": 0.42160278745644597, | |
| "grad_norm": 1.0064547061920166, | |
| "kl": 0.0494384765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.3392857909202576, | |
| "reward_std": 0.07695359364151955, | |
| "rewards/accuracy_reward": 0.3571428805589676, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 72.0714340209961, | |
| "epoch": 0.42334494773519166, | |
| "grad_norm": 0.7986550331115723, | |
| "kl": 0.0435791015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.3214285969734192, | |
| "reward_std": 0.04123930633068085, | |
| "rewards/accuracy_reward": 0.3214285969734192, | |
| "rewards/format_reward": 1.0, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 109.64286041259766, | |
| "epoch": 0.4250871080139373, | |
| "grad_norm": 1.521716833114624, | |
| "kl": 0.06268310546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.3750000596046448, | |
| "reward_std": 0.1071428619325161, | |
| "rewards/accuracy_reward": 0.3750000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 64.33928871154785, | |
| "epoch": 0.4268292682926829, | |
| "grad_norm": 1.0075122117996216, | |
| "kl": 0.037353515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.4285715222358704, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.4285714477300644, | |
| "rewards/format_reward": 1.0, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 134.35715103149414, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 1.5240962505340576, | |
| "kl": 0.03082275390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.2321428656578064, | |
| "reward_std": 0.1181928962469101, | |
| "rewards/accuracy_reward": 0.2321428656578064, | |
| "rewards/format_reward": 1.0, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 201.75000762939453, | |
| "epoch": 0.43031358885017423, | |
| "grad_norm": 0.5076520442962646, | |
| "kl": 0.04052734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.2500000596046448, | |
| "reward_std": 0.12974976375699043, | |
| "rewards/accuracy_reward": 0.2857142984867096, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 96.41071701049805, | |
| "epoch": 0.43205574912891986, | |
| "grad_norm": 2.3108022212982178, | |
| "kl": 0.0411376953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.5000001192092896, | |
| "reward_std": 0.11266788095235825, | |
| "rewards/accuracy_reward": 0.5000000149011612, | |
| "rewards/format_reward": 1.0, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 106.32143020629883, | |
| "epoch": 0.4337979094076655, | |
| "grad_norm": 2.105095386505127, | |
| "kl": 0.03216552734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.3392857313156128, | |
| "reward_std": 0.14838216453790665, | |
| "rewards/accuracy_reward": 0.3392857313156128, | |
| "rewards/format_reward": 1.0, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 74.57143020629883, | |
| "epoch": 0.4355400696864111, | |
| "grad_norm": 2.9153895378112793, | |
| "kl": 0.04345703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.4642857909202576, | |
| "reward_std": 0.1539071872830391, | |
| "rewards/accuracy_reward": 0.4642857313156128, | |
| "rewards/format_reward": 1.0, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 199.92858123779297, | |
| "epoch": 0.4372822299651568, | |
| "grad_norm": 2.763808488845825, | |
| "kl": 0.0286865234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.2321429252624512, | |
| "reward_std": 0.3324786275625229, | |
| "rewards/accuracy_reward": 0.2857142873108387, | |
| "rewards/format_reward": 0.9464285969734192, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 68.98214721679688, | |
| "epoch": 0.43902439024390244, | |
| "grad_norm": 1.7675226926803589, | |
| "kl": 0.0506591796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.5178571939468384, | |
| "reward_std": 0.1071428656578064, | |
| "rewards/accuracy_reward": 0.517857164144516, | |
| "rewards/format_reward": 1.0, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 66.26785945892334, | |
| "epoch": 0.44076655052264807, | |
| "grad_norm": 2.443754196166992, | |
| "kl": 0.03668212890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.4821429252624512, | |
| "reward_std": 0.29123930633068085, | |
| "rewards/accuracy_reward": 0.4821428954601288, | |
| "rewards/format_reward": 1.0, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 216.76786041259766, | |
| "epoch": 0.4425087108013937, | |
| "grad_norm": 0.465847373008728, | |
| "kl": 0.03338623046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.3928572535514832, | |
| "reward_std": 0.20117833465337753, | |
| "rewards/accuracy_reward": 0.4464285969734192, | |
| "rewards/format_reward": 0.9464285969734192, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 107.25000381469727, | |
| "epoch": 0.4442508710801394, | |
| "grad_norm": 1.1041370630264282, | |
| "kl": 0.0469970703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.4821429252624512, | |
| "reward_std": 0.1896214708685875, | |
| "rewards/accuracy_reward": 0.5000000298023224, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 134.30358123779297, | |
| "epoch": 0.445993031358885, | |
| "grad_norm": 1.6369932889938354, | |
| "kl": 0.0478515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.3571429252624512, | |
| "reward_std": 0.1539071798324585, | |
| "rewards/accuracy_reward": 0.3750000149011612, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 127.5535774230957, | |
| "epoch": 0.44773519163763065, | |
| "grad_norm": 0.9092658162117004, | |
| "kl": 0.0428466796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.2678571939468384, | |
| "reward_std": 0.0357142873108387, | |
| "rewards/accuracy_reward": 0.267857164144516, | |
| "rewards/format_reward": 1.0, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 132.4107208251953, | |
| "epoch": 0.44947735191637633, | |
| "grad_norm": 1.8821191787719727, | |
| "kl": 0.042724609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.3750000596046448, | |
| "reward_std": 0.1785714402794838, | |
| "rewards/accuracy_reward": 0.392857164144516, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 197.5714340209961, | |
| "epoch": 0.45121951219512196, | |
| "grad_norm": 2.47094464302063, | |
| "kl": 0.02911376953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.2142857909202576, | |
| "reward_std": 0.2253357619047165, | |
| "rewards/accuracy_reward": 0.2321428693830967, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 134.96429443359375, | |
| "epoch": 0.4529616724738676, | |
| "grad_norm": 1.3298813104629517, | |
| "kl": 0.0323486328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.3750000596046448, | |
| "reward_std": 0.1181928962469101, | |
| "rewards/accuracy_reward": 0.3750000298023224, | |
| "rewards/format_reward": 1.0, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 104.3035774230957, | |
| "epoch": 0.4547038327526132, | |
| "grad_norm": 2.41137957572937, | |
| "kl": 0.044921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.071428656578064, | |
| "reward_std": 0.11266787722706795, | |
| "rewards/accuracy_reward": 0.07142857648432255, | |
| "rewards/format_reward": 1.0, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 111.57143783569336, | |
| "epoch": 0.4564459930313589, | |
| "grad_norm": 2.6853108406066895, | |
| "kl": 0.022705078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.3571429252624512, | |
| "reward_std": 0.1428571492433548, | |
| "rewards/accuracy_reward": 0.3571428805589676, | |
| "rewards/format_reward": 1.0, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 93.35714721679688, | |
| "epoch": 0.45818815331010454, | |
| "grad_norm": 2.3133461475372314, | |
| "kl": 0.0418701171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.5178571939468384, | |
| "reward_std": 0.2610500380396843, | |
| "rewards/accuracy_reward": 0.5357142984867096, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 168.1607208251953, | |
| "epoch": 0.45993031358885017, | |
| "grad_norm": 2.1456406116485596, | |
| "kl": 0.03265380859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.285714328289032, | |
| "reward_std": 0.2253357619047165, | |
| "rewards/accuracy_reward": 0.2857143059372902, | |
| "rewards/format_reward": 1.0, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 117.03571701049805, | |
| "epoch": 0.4616724738675958, | |
| "grad_norm": 2.640500545501709, | |
| "kl": 0.03973388671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.410714328289032, | |
| "reward_std": 0.14838216453790665, | |
| "rewards/accuracy_reward": 0.4107142984867096, | |
| "rewards/format_reward": 1.0, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 206.94644165039062, | |
| "epoch": 0.4634146341463415, | |
| "grad_norm": 0.7177957892417908, | |
| "kl": 0.02777099609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.1250000596046448, | |
| "reward_std": 0.1071428656578064, | |
| "rewards/accuracy_reward": 0.1428571492433548, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 181.1607208251953, | |
| "epoch": 0.4651567944250871, | |
| "grad_norm": 1.6306074857711792, | |
| "kl": 0.03717041015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.160714328289032, | |
| "reward_std": 0.29123931378126144, | |
| "rewards/accuracy_reward": 0.1785714402794838, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 165.6428680419922, | |
| "epoch": 0.46689895470383275, | |
| "grad_norm": 1.0887411832809448, | |
| "kl": 0.02734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.5357143878936768, | |
| "reward_std": 0.17553050816059113, | |
| "rewards/accuracy_reward": 0.5535714477300644, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 172.42858123779297, | |
| "epoch": 0.4686411149825784, | |
| "grad_norm": 0.8613088130950928, | |
| "kl": 0.03253173828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.160714328289032, | |
| "reward_std": 0.07695359364151955, | |
| "rewards/accuracy_reward": 0.1785714328289032, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 235.9821548461914, | |
| "epoch": 0.47038327526132406, | |
| "grad_norm": 1.2009068727493286, | |
| "kl": 0.0572509765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.1250000596046448, | |
| "reward_std": 0.21981072798371315, | |
| "rewards/accuracy_reward": 0.14285714365541935, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 155.21428680419922, | |
| "epoch": 0.4721254355400697, | |
| "grad_norm": 0.7742925882339478, | |
| "kl": 0.0279541015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.3214285969734192, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.321428582072258, | |
| "rewards/format_reward": 1.0, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 179.2678680419922, | |
| "epoch": 0.4738675958188153, | |
| "grad_norm": 1.1722646951675415, | |
| "kl": 0.02838134765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.4285715222358704, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.4285714477300644, | |
| "rewards/format_reward": 1.0, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 201.83929443359375, | |
| "epoch": 0.47560975609756095, | |
| "grad_norm": 1.630911946296692, | |
| "kl": 0.0638427734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.3035714626312256, | |
| "reward_std": 0.07695359364151955, | |
| "rewards/accuracy_reward": 0.321428582072258, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 126.08929061889648, | |
| "epoch": 0.47735191637630664, | |
| "grad_norm": 2.8433239459991455, | |
| "kl": 0.032958984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.3928572535514832, | |
| "reward_std": 0.18409645557403564, | |
| "rewards/accuracy_reward": 0.3928571492433548, | |
| "rewards/format_reward": 1.0, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 146.4464340209961, | |
| "epoch": 0.47909407665505227, | |
| "grad_norm": 2.8697118759155273, | |
| "kl": 0.02886962890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.4285714626312256, | |
| "reward_std": 0.11266787722706795, | |
| "rewards/accuracy_reward": 0.4285714328289032, | |
| "rewards/format_reward": 1.0, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 235.2321548461914, | |
| "epoch": 0.4808362369337979, | |
| "grad_norm": 1.3921054601669312, | |
| "kl": 0.0537109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.2142857909202576, | |
| "reward_std": 0.1428571529686451, | |
| "rewards/accuracy_reward": 0.2678571492433548, | |
| "rewards/format_reward": 0.9464285969734192, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 173.71429443359375, | |
| "epoch": 0.48257839721254353, | |
| "grad_norm": 1.9831897020339966, | |
| "kl": 0.093994140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0037, | |
| "reward": 1.3214285969734192, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.3214285969734192, | |
| "rewards/format_reward": 1.0, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 191.69644165039062, | |
| "epoch": 0.4843205574912892, | |
| "grad_norm": 1.5223335027694702, | |
| "kl": 0.03350830078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.3035714626312256, | |
| "reward_std": 0.21981073915958405, | |
| "rewards/accuracy_reward": 0.321428582072258, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 170.60714721679688, | |
| "epoch": 0.48606271777003485, | |
| "grad_norm": 0.9542393088340759, | |
| "kl": 0.03057861328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.446428656578064, | |
| "reward_std": 0.1071428619325161, | |
| "rewards/accuracy_reward": 0.4642857313156128, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 236.96428680419922, | |
| "epoch": 0.4878048780487805, | |
| "grad_norm": 1.6516475677490234, | |
| "kl": 0.0150146484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.3392857313156128, | |
| "reward_std": 0.07695358991622925, | |
| "rewards/accuracy_reward": 0.3392857313156128, | |
| "rewards/format_reward": 1.0, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 201.25000762939453, | |
| "epoch": 0.4895470383275261, | |
| "grad_norm": 1.9830681085586548, | |
| "kl": 0.02423095703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.3392857909202576, | |
| "reward_std": 0.1785714402794838, | |
| "rewards/accuracy_reward": 0.3571428805589676, | |
| "rewards/format_reward": 0.9821428656578064, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 186.05358123779297, | |
| "epoch": 0.4912891986062718, | |
| "grad_norm": 3.112156629562378, | |
| "kl": 0.02825927734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.3571429252624512, | |
| "reward_std": 0.26657506078481674, | |
| "rewards/accuracy_reward": 0.3571428805589676, | |
| "rewards/format_reward": 1.0, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 149.62500762939453, | |
| "epoch": 0.4930313588850174, | |
| "grad_norm": 0.9592561721801758, | |
| "kl": 0.03778076171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.3571429252624512, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.357142873108387, | |
| "rewards/format_reward": 1.0, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 213.50000762939453, | |
| "epoch": 0.49477351916376305, | |
| "grad_norm": 0.42550015449523926, | |
| "kl": 0.02685546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.321428656578064, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.357142873108387, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 197.5357208251953, | |
| "epoch": 0.4965156794425087, | |
| "grad_norm": 0.991173505783081, | |
| "kl": 0.0289306640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.3928571939468384, | |
| "reward_std": 0.0714285746216774, | |
| "rewards/accuracy_reward": 0.392857164144516, | |
| "rewards/format_reward": 1.0, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 133.76786422729492, | |
| "epoch": 0.49825783972125437, | |
| "grad_norm": 1.0027332305908203, | |
| "kl": 0.0306396484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.160714328289032, | |
| "reward_std": 0.1071428619325161, | |
| "rewards/accuracy_reward": 0.160714291036129, | |
| "rewards/format_reward": 1.0, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 90.62500762939453, | |
| "epoch": 0.5, | |
| "grad_norm": 2.177978277206421, | |
| "kl": 0.05548095703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.1607143878936768, | |
| "reward_std": 0.14838216826319695, | |
| "rewards/accuracy_reward": 0.16071429289877415, | |
| "rewards/format_reward": 1.0, | |
| "step": 287 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 287, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |