{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997324056729997, "eval_steps": 500, "global_step": 934, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 116.0, "epoch": 0.0010703773080010704, "grad_norm": 0.31743481755256653, "kl": 0.0, "learning_rate": 1.997858672376874e-05, "loss": 0.0, "reward": 0.71875, "reward_std": 0.39774754643440247, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.28125, "step": 1 }, { "completion_length": 133.125, "epoch": 0.0021407546160021407, "grad_norm": 0.30792707204818726, "kl": 0.00024101027156575583, "learning_rate": 1.9957173447537473e-05, "loss": 0.0, "reward": 0.71875, "reward_std": 0.39774755761027336, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.28125, "step": 2 }, { "completion_length": 122.3125, "epoch": 0.003211131924003211, "grad_norm": 0.3811380863189697, "kl": 0.00022512302530230954, "learning_rate": 1.993576017130621e-05, "loss": 0.0, "reward": 0.65625, "reward_std": 0.48613589629530907, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.21875, "step": 3 }, { "completion_length": 130.75, "epoch": 0.004281509232004281, "grad_norm": 0.3755797743797302, "kl": 0.00019449955652817152, "learning_rate": 1.9914346895074948e-05, "loss": 0.0, "reward": 0.40625, "reward_std": 0.574524249881506, "rewards/correctness_reward_func_qa": 0.25, "rewards/format_reward_func_qa": 0.15625, "step": 4 }, { "completion_length": 137.0, "epoch": 0.005351886540005352, "grad_norm": 0.385049045085907, "kl": 0.00024964216208900325, "learning_rate": 1.9892933618843685e-05, "loss": 0.0, "reward": 0.53125, "reward_std": 0.7513009235262871, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.15625, "step": 5 }, { "completion_length": 141.9375, "epoch": 0.006422263848006422, "grad_norm": 0.3133525848388672, "kl": 0.0002005682654271368, "learning_rate": 1.987152034261242e-05, "loss": 0.0, "reward": 0.5, "reward_std": 0.3535533770918846, "rewards/correctness_reward_func_qa": 0.25, "rewards/format_reward_func_qa": 0.25, "step": 6 }, { "completion_length": 126.6875, "epoch": 0.007492641156007493, "grad_norm": 0.3482169508934021, "kl": 0.00034537997271399945, "learning_rate": 1.985010706638116e-05, "loss": 0.0, "reward": 0.59375, "reward_std": 0.39774756133556366, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.21875, "step": 7 }, { "completion_length": 122.8125, "epoch": 0.008563018464008563, "grad_norm": 0.29682475328445435, "kl": 0.00024446644965792075, "learning_rate": 1.9828693790149897e-05, "loss": 0.0, "reward": 0.65625, "reward_std": 0.39774755761027336, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.21875, "step": 8 }, { "completion_length": 114.4375, "epoch": 0.009633395772009633, "grad_norm": 0.35831010341644287, "kl": 0.00017703452977002598, "learning_rate": 1.980728051391863e-05, "loss": 0.0, "reward": 0.625, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.25, "step": 9 }, { "completion_length": 129.5, "epoch": 0.010703773080010704, "grad_norm": 0.31033065915107727, "kl": 0.00020307257364038378, "learning_rate": 1.9785867237687368e-05, "loss": 0.0, "reward": 0.59375, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.15625, "step": 10 }, { "completion_length": 141.375, "epoch": 0.011774150388011774, "grad_norm": 0.27584683895111084, "kl": 0.00029683535103686154, "learning_rate": 1.9764453961456106e-05, "loss": 0.0, "reward": 1.0, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 11 }, { "completion_length": 126.0, "epoch": 0.012844527696012844, "grad_norm": 0.3699285686016083, "kl": 0.0002581803892098833, "learning_rate": 1.9743040685224843e-05, "loss": 0.0, "reward": 0.6875, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.25, "step": 12 }, { "completion_length": 112.0, "epoch": 0.013914905004013914, "grad_norm": 0.38912978768348694, "kl": 0.000297748927550856, "learning_rate": 1.9721627408993577e-05, "loss": 0.0, "reward": 0.5625, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.25, "step": 13 }, { "completion_length": 137.4375, "epoch": 0.014985282312014986, "grad_norm": 0.46140241622924805, "kl": 0.00031706302252132446, "learning_rate": 1.9700214132762314e-05, "loss": 0.0, "reward": 0.53125, "reward_std": 0.39774755761027336, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.21875, "step": 14 }, { "completion_length": 134.875, "epoch": 0.016055659620016056, "grad_norm": 0.3349648416042328, "kl": 0.00032720699527999386, "learning_rate": 1.967880085653105e-05, "loss": 0.0, "reward": 0.34375, "reward_std": 0.48613589257001877, "rewards/correctness_reward_func_qa": 0.1875, "rewards/format_reward_func_qa": 0.15625, "step": 15 }, { "completion_length": 150.0, "epoch": 0.017126036928017126, "grad_norm": 0.2391258031129837, "kl": 0.0003202417938155122, "learning_rate": 1.9657387580299786e-05, "loss": 0.0, "reward": 0.78125, "reward_std": 0.30935920402407646, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.28125, "step": 16 }, { "completion_length": 138.125, "epoch": 0.018196414236018196, "grad_norm": 0.3458104729652405, "kl": 0.0003185477544320747, "learning_rate": 1.9635974304068523e-05, "loss": 0.0, "reward": 0.59375, "reward_std": 0.39774755015969276, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.21875, "step": 17 }, { "completion_length": 124.625, "epoch": 0.019266791544019266, "grad_norm": 0.3071163296699524, "kl": 0.0003691624369821511, "learning_rate": 1.961456102783726e-05, "loss": 0.0, "reward": 0.96875, "reward_std": 0.39774755761027336, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.28125, "step": 18 }, { "completion_length": 142.25, "epoch": 0.020337168852020335, "grad_norm": 0.3743300139904022, "kl": 0.0003291548855486326, "learning_rate": 1.9593147751605998e-05, "loss": 0.0, "reward": 0.59375, "reward_std": 0.39774755015969276, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.15625, "step": 19 }, { "completion_length": 143.75, "epoch": 0.02140754616002141, "grad_norm": 0.30565181374549866, "kl": 0.0003879353607771918, "learning_rate": 1.957173447537473e-05, "loss": 0.0, "reward": 1.03125, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.34375, "step": 20 }, { "completion_length": 128.625, "epoch": 0.02247792346802248, "grad_norm": 0.3761177957057953, "kl": 0.000488572237372864, "learning_rate": 1.9550321199143472e-05, "loss": 0.0, "reward": 0.875, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.3125, "step": 21 }, { "completion_length": 120.875, "epoch": 0.02354830077602355, "grad_norm": 0.3983606696128845, "kl": 0.0004765185876749456, "learning_rate": 1.952890792291221e-05, "loss": 0.0, "reward": 0.375, "reward_std": 0.4419417195022106, "rewards/correctness_reward_func_qa": 0.1875, "rewards/format_reward_func_qa": 0.1875, "step": 22 }, { "completion_length": 141.375, "epoch": 0.02461867808402462, "grad_norm": 0.17304256558418274, "kl": 0.00046406158071476966, "learning_rate": 1.9507494646680944e-05, "loss": 0.0, "reward": 0.34375, "reward_std": 0.22097086906433105, "rewards/correctness_reward_func_qa": 0.125, "rewards/format_reward_func_qa": 0.21875, "step": 23 }, { "completion_length": 141.9375, "epoch": 0.02568905539202569, "grad_norm": 0.21301382780075073, "kl": 0.00047756738058524206, "learning_rate": 1.948608137044968e-05, "loss": 0.0, "reward": 0.65625, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.21875, "step": 24 }, { "completion_length": 138.6875, "epoch": 0.02675943270002676, "grad_norm": 0.12377602607011795, "kl": 0.0006441890946007334, "learning_rate": 1.9464668094218418e-05, "loss": 0.0, "reward": 0.65625, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.21875, "step": 25 }, { "completion_length": 139.0, "epoch": 0.02782981000802783, "grad_norm": 0.38574329018592834, "kl": 0.0005253907584119588, "learning_rate": 1.9443254817987152e-05, "loss": 0.0, "reward": 0.65625, "reward_std": 0.48613590002059937, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.21875, "step": 26 }, { "completion_length": 117.0625, "epoch": 0.028900187316028902, "grad_norm": 0.4199778735637665, "kl": 0.0006764229183318093, "learning_rate": 1.942184154175589e-05, "loss": 0.0, "reward": 0.65625, "reward_std": 0.30935921519994736, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.28125, "step": 27 }, { "completion_length": 127.5625, "epoch": 0.02997056462402997, "grad_norm": 0.5563454031944275, "kl": 0.0006304629205260426, "learning_rate": 1.9400428265524627e-05, "loss": 0.0, "reward": 0.84375, "reward_std": 0.574524249881506, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 28 }, { "completion_length": 132.1875, "epoch": 0.03104094193203104, "grad_norm": 0.4709160327911377, "kl": 0.0006892134988447651, "learning_rate": 1.9379014989293364e-05, "loss": 0.0, "reward": 0.59375, "reward_std": 0.39774755015969276, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.28125, "step": 29 }, { "completion_length": 136.5625, "epoch": 0.03211131924003211, "grad_norm": 0.4107370972633362, "kl": 0.0008823904645396397, "learning_rate": 1.9357601713062098e-05, "loss": 0.0, "reward": 0.75, "reward_std": 0.4419417232275009, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.25, "step": 30 }, { "completion_length": 138.75, "epoch": 0.033181696548033185, "grad_norm": 0.2810315489768982, "kl": 0.000982907113211695, "learning_rate": 1.9336188436830836e-05, "loss": 0.0, "reward": 0.46875, "reward_std": 0.39774756133556366, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.15625, "step": 31 }, { "completion_length": 139.4375, "epoch": 0.03425207385603425, "grad_norm": 0.3647318482398987, "kl": 0.0011869634690810926, "learning_rate": 1.9314775160599573e-05, "loss": 0.0, "reward": 0.71875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.21875, "step": 32 }, { "completion_length": 144.875, "epoch": 0.035322451164035325, "grad_norm": 0.3598644435405731, "kl": 0.00116271227307152, "learning_rate": 1.929336188436831e-05, "loss": 0.0, "reward": 0.65625, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.34375, "step": 33 }, { "completion_length": 145.375, "epoch": 0.03639282847203639, "grad_norm": 0.44565892219543457, "kl": 0.001034990549669601, "learning_rate": 1.9271948608137044e-05, "loss": 0.0, "reward": 0.71875, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.34375, "step": 34 }, { "completion_length": 143.25, "epoch": 0.037463205780037465, "grad_norm": 0.21219965815544128, "kl": 0.001081713242456317, "learning_rate": 1.9250535331905785e-05, "loss": 0.0, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 35 }, { "completion_length": 140.0625, "epoch": 0.03853358308803853, "grad_norm": 0.45747244358062744, "kl": 0.0011074868816649541, "learning_rate": 1.9229122055674522e-05, "loss": 0.0, "reward": 0.625, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.25, "step": 36 }, { "completion_length": 148.5, "epoch": 0.039603960396039604, "grad_norm": 0.2310914397239685, "kl": 0.0011647726641967893, "learning_rate": 1.9207708779443256e-05, "loss": 0.0, "reward": 1.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.375, "step": 37 }, { "completion_length": 144.125, "epoch": 0.04067433770404067, "grad_norm": 0.2626703977584839, "kl": 0.0011779300984926522, "learning_rate": 1.9186295503211994e-05, "loss": 0.0, "reward": 0.34375, "reward_std": 0.30935921519994736, "rewards/correctness_reward_func_qa": 0.125, "rewards/format_reward_func_qa": 0.21875, "step": 38 }, { "completion_length": 139.875, "epoch": 0.041744715012041744, "grad_norm": 0.2537483870983124, "kl": 0.001436288992408663, "learning_rate": 1.916488222698073e-05, "loss": 0.0001, "reward": 0.78125, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.28125, "step": 39 }, { "completion_length": 126.4375, "epoch": 0.04281509232004282, "grad_norm": 0.3841283619403839, "kl": 0.005532597075216472, "learning_rate": 1.9143468950749465e-05, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.4419417232275009, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.3125, "step": 40 }, { "completion_length": 142.625, "epoch": 0.043885469628043884, "grad_norm": 0.41511058807373047, "kl": 0.001284091005800292, "learning_rate": 1.9122055674518202e-05, "loss": 0.0001, "reward": 0.8125, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.375, "step": 41 }, { "completion_length": 137.0, "epoch": 0.04495584693604496, "grad_norm": 0.32929864525794983, "kl": 0.0019829896045848727, "learning_rate": 1.910064239828694e-05, "loss": 0.0001, "reward": 0.875, "reward_std": 0.3535533770918846, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.3125, "step": 42 }, { "completion_length": 131.5625, "epoch": 0.046026224244046024, "grad_norm": 0.2032199203968048, "kl": 0.0022189949231687933, "learning_rate": 1.9079229122055677e-05, "loss": 0.0001, "reward": 0.75, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.25, "step": 43 }, { "completion_length": 142.5625, "epoch": 0.0470966015520471, "grad_norm": 0.3018956184387207, "kl": 0.0013061394565738738, "learning_rate": 1.905781584582441e-05, "loss": 0.0001, "reward": 1.0625, "reward_std": 0.2651650421321392, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.3125, "step": 44 }, { "completion_length": 134.25, "epoch": 0.048166978860048164, "grad_norm": 0.28898701071739197, "kl": 0.0026804600056493655, "learning_rate": 1.9036402569593148e-05, "loss": 0.0001, "reward": 0.84375, "reward_std": 0.39774755761027336, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.15625, "step": 45 }, { "completion_length": 150.0, "epoch": 0.04923735616804924, "grad_norm": 0.327416330575943, "kl": 0.0017225549090653658, "learning_rate": 1.9014989293361886e-05, "loss": 0.0001, "reward": 1.0, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 46 }, { "completion_length": 137.625, "epoch": 0.05030773347605031, "grad_norm": 0.3532528579235077, "kl": 0.0025384978798683733, "learning_rate": 1.8993576017130623e-05, "loss": 0.0001, "reward": 0.90625, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 47 }, { "completion_length": 146.4375, "epoch": 0.05137811078405138, "grad_norm": 0.41901007294654846, "kl": 0.0013859872269676998, "learning_rate": 1.8972162740899357e-05, "loss": 0.0001, "reward": 0.78125, "reward_std": 0.5745242387056351, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.34375, "step": 48 }, { "completion_length": 121.0, "epoch": 0.05244848809205245, "grad_norm": 0.34599751234054565, "kl": 0.0027808782178908587, "learning_rate": 1.8950749464668098e-05, "loss": 0.0001, "reward": 1.1875, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 49 }, { "completion_length": 139.6875, "epoch": 0.05351886540005352, "grad_norm": 0.2900560200214386, "kl": 0.0014492868795059621, "learning_rate": 1.892933618843683e-05, "loss": 0.0001, "reward": 0.84375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 50 }, { "completion_length": 147.1875, "epoch": 0.05458924270805459, "grad_norm": 0.3093114495277405, "kl": 0.0020692676480393857, "learning_rate": 1.890792291220557e-05, "loss": 0.0001, "reward": 0.78125, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.28125, "step": 51 }, { "completion_length": 127.9375, "epoch": 0.05565962001605566, "grad_norm": 0.11066348105669022, "kl": 0.0030422286363318563, "learning_rate": 1.8886509635974306e-05, "loss": 0.0001, "reward": 0.71875, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.28125, "step": 52 }, { "completion_length": 131.875, "epoch": 0.05672999732405673, "grad_norm": 0.28639620542526245, "kl": 0.003938024019589648, "learning_rate": 1.8865096359743044e-05, "loss": 0.0002, "reward": 0.875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 53 }, { "completion_length": 135.4375, "epoch": 0.057800374632057804, "grad_norm": 0.3987920582294464, "kl": 0.0017022098327288404, "learning_rate": 1.8843683083511778e-05, "loss": 0.0001, "reward": 0.90625, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.21875, "step": 54 }, { "completion_length": 150.0, "epoch": 0.05887075194005887, "grad_norm": 0.34375810623168945, "kl": 0.00198919145623222, "learning_rate": 1.8822269807280515e-05, "loss": 0.0001, "reward": 0.96875, "reward_std": 0.39774754643440247, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.28125, "step": 55 }, { "completion_length": 141.625, "epoch": 0.05994112924805994, "grad_norm": 0.21933616697788239, "kl": 0.0018941237358376384, "learning_rate": 1.8800856531049252e-05, "loss": 0.0001, "reward": 1.03125, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 56 }, { "completion_length": 142.5, "epoch": 0.06101150655606101, "grad_norm": 0.133380725979805, "kl": 0.0024971190723590553, "learning_rate": 1.877944325481799e-05, "loss": 0.0001, "reward": 1.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.375, "step": 57 }, { "completion_length": 141.8125, "epoch": 0.06208188386406208, "grad_norm": 0.34304964542388916, "kl": 0.002478797105140984, "learning_rate": 1.8758029978586724e-05, "loss": 0.0001, "reward": 0.84375, "reward_std": 0.22097086906433105, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 58 }, { "completion_length": 145.125, "epoch": 0.06315226117206316, "grad_norm": 0.24925751984119415, "kl": 0.0018571934051578864, "learning_rate": 1.873661670235546e-05, "loss": 0.0001, "reward": 0.78125, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.21875, "step": 59 }, { "completion_length": 136.4375, "epoch": 0.06422263848006422, "grad_norm": 0.21945904195308685, "kl": 0.004016537277493626, "learning_rate": 1.8715203426124198e-05, "loss": 0.0002, "reward": 0.3125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.1875, "rewards/format_reward_func_qa": 0.125, "step": 60 }, { "completion_length": 123.6875, "epoch": 0.06529301578806529, "grad_norm": 0.48852452635765076, "kl": 0.002927020424976945, "learning_rate": 1.8693790149892936e-05, "loss": 0.0001, "reward": 1.0625, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.375, "step": 61 }, { "completion_length": 140.4375, "epoch": 0.06636339309606637, "grad_norm": 0.12853221595287323, "kl": 0.0021651791175827384, "learning_rate": 1.8672376873661673e-05, "loss": 0.0001, "reward": 0.78125, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.28125, "step": 62 }, { "completion_length": 144.5, "epoch": 0.06743377040406744, "grad_norm": 0.3789617717266083, "kl": 0.0033225759398192167, "learning_rate": 1.865096359743041e-05, "loss": 0.0001, "reward": 0.8125, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.3125, "step": 63 }, { "completion_length": 142.8125, "epoch": 0.0685041477120685, "grad_norm": 0.21035021543502808, "kl": 0.002012499957345426, "learning_rate": 1.8629550321199144e-05, "loss": 0.0001, "reward": 1.0, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 64 }, { "completion_length": 139.1875, "epoch": 0.06957452502006957, "grad_norm": 0.4082532823085785, "kl": 0.0024256658216472715, "learning_rate": 1.860813704496788e-05, "loss": 0.0001, "reward": 1.0625, "reward_std": 0.5303300805389881, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.375, "step": 65 }, { "completion_length": 146.8125, "epoch": 0.07064490232807065, "grad_norm": 0.3663899600505829, "kl": 0.002291058568516746, "learning_rate": 1.858672376873662e-05, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.39774755015969276, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.28125, "step": 66 }, { "completion_length": 139.3125, "epoch": 0.07171527963607172, "grad_norm": 0.21365651488304138, "kl": 0.0028595749172382057, "learning_rate": 1.8565310492505356e-05, "loss": 0.0001, "reward": 1.03125, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 67 }, { "completion_length": 142.75, "epoch": 0.07278565694407278, "grad_norm": 0.29539352655410767, "kl": 0.001882457872852683, "learning_rate": 1.854389721627409e-05, "loss": 0.0001, "reward": 1.09375, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 68 }, { "completion_length": 132.4375, "epoch": 0.07385603425207386, "grad_norm": 0.3570699989795685, "kl": 0.0034791930520441383, "learning_rate": 1.8522483940042828e-05, "loss": 0.0001, "reward": 0.90625, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 69 }, { "completion_length": 145.375, "epoch": 0.07492641156007493, "grad_norm": 0.3662557899951935, "kl": 0.0027310122968629003, "learning_rate": 1.8501070663811565e-05, "loss": 0.0001, "reward": 0.8125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.3125, "step": 70 }, { "completion_length": 130.875, "epoch": 0.075996788868076, "grad_norm": 0.32412099838256836, "kl": 0.0025520726339891553, "learning_rate": 1.8479657387580302e-05, "loss": 0.0001, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.25, "step": 71 }, { "completion_length": 138.9375, "epoch": 0.07706716617607706, "grad_norm": 0.41652974486351013, "kl": 0.002520703535992652, "learning_rate": 1.8458244111349036e-05, "loss": 0.0001, "reward": 0.9375, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 72 }, { "completion_length": 136.125, "epoch": 0.07813754348407814, "grad_norm": 0.37924128770828247, "kl": 0.003047638470889069, "learning_rate": 1.8436830835117774e-05, "loss": 0.0001, "reward": 0.9375, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.3125, "step": 73 }, { "completion_length": 139.0625, "epoch": 0.07920792079207921, "grad_norm": 0.32978716492652893, "kl": 0.0033772956812754273, "learning_rate": 1.841541755888651e-05, "loss": 0.0001, "reward": 0.96875, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 74 }, { "completion_length": 139.875, "epoch": 0.08027829810008028, "grad_norm": 0.22541023790836334, "kl": 0.003501177066937089, "learning_rate": 1.8394004282655248e-05, "loss": 0.0001, "reward": 0.96875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.34375, "step": 75 }, { "completion_length": 127.375, "epoch": 0.08134867540808134, "grad_norm": 0.3599226474761963, "kl": 0.0025884893839247525, "learning_rate": 1.8372591006423986e-05, "loss": 0.0001, "reward": 1.09375, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 76 }, { "completion_length": 144.4375, "epoch": 0.08241905271608242, "grad_norm": 0.33651289343833923, "kl": 0.0024859162513166666, "learning_rate": 1.8351177730192723e-05, "loss": 0.0001, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 77 }, { "completion_length": 141.625, "epoch": 0.08348943002408349, "grad_norm": 0.33569079637527466, "kl": 0.004442804915015586, "learning_rate": 1.8329764453961457e-05, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.30935920402407646, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.34375, "step": 78 }, { "completion_length": 150.0, "epoch": 0.08455980733208456, "grad_norm": 0.17233355343341827, "kl": 0.002778201305773109, "learning_rate": 1.8308351177730194e-05, "loss": 0.0001, "reward": 0.78125, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.46875, "step": 79 }, { "completion_length": 146.1875, "epoch": 0.08563018464008564, "grad_norm": 0.2851436734199524, "kl": 0.003360766015248373, "learning_rate": 1.828693790149893e-05, "loss": 0.0001, "reward": 0.71875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.28125, "step": 80 }, { "completion_length": 140.5625, "epoch": 0.0867005619480867, "grad_norm": 0.3808906376361847, "kl": 0.0036091282381676137, "learning_rate": 1.826552462526767e-05, "loss": 0.0001, "reward": 1.15625, "reward_std": 0.30935920402407646, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.40625, "step": 81 }, { "completion_length": 140.75, "epoch": 0.08777093925608777, "grad_norm": 0.5014867782592773, "kl": 0.0030082170269452035, "learning_rate": 1.8244111349036403e-05, "loss": 0.0001, "reward": 1.09375, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 82 }, { "completion_length": 135.1875, "epoch": 0.08884131656408883, "grad_norm": 0.5195692777633667, "kl": 0.0027257396723143756, "learning_rate": 1.822269807280514e-05, "loss": 0.0001, "reward": 0.5625, "reward_std": 0.6187184266746044, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.25, "step": 83 }, { "completion_length": 137.75, "epoch": 0.08991169387208992, "grad_norm": 0.2662360668182373, "kl": 0.003949453530367464, "learning_rate": 1.8201284796573878e-05, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.28125, "step": 84 }, { "completion_length": 148.4375, "epoch": 0.09098207118009098, "grad_norm": 0.38695865869522095, "kl": 0.002344512817217037, "learning_rate": 1.8179871520342615e-05, "loss": 0.0001, "reward": 1.0, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 85 }, { "completion_length": 145.3125, "epoch": 0.09205244848809205, "grad_norm": 0.40999820828437805, "kl": 0.0024858082470018417, "learning_rate": 1.815845824411135e-05, "loss": 0.0001, "reward": 0.71875, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.28125, "step": 86 }, { "completion_length": 137.625, "epoch": 0.09312282579609313, "grad_norm": 0.390880286693573, "kl": 0.002343037282116711, "learning_rate": 1.8137044967880086e-05, "loss": 0.0001, "reward": 1.15625, "reward_std": 0.48613589257001877, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.40625, "step": 87 }, { "completion_length": 146.0, "epoch": 0.0941932031040942, "grad_norm": 0.3660587668418884, "kl": 0.0024053844390437007, "learning_rate": 1.8115631691648824e-05, "loss": 0.0001, "reward": 1.09375, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 88 }, { "completion_length": 141.875, "epoch": 0.09526358041209526, "grad_norm": 0.3368431627750397, "kl": 0.0027157479198649526, "learning_rate": 1.809421841541756e-05, "loss": 0.0001, "reward": 0.9375, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 89 }, { "completion_length": 138.9375, "epoch": 0.09633395772009633, "grad_norm": 0.20494520664215088, "kl": 0.0027889375924132764, "learning_rate": 1.8072805139186298e-05, "loss": 0.0001, "reward": 0.9375, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.3125, "step": 90 }, { "completion_length": 147.0, "epoch": 0.09740433502809741, "grad_norm": 0.4563634693622589, "kl": 0.003234080853872001, "learning_rate": 1.8051391862955036e-05, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.28125, "step": 91 }, { "completion_length": 139.75, "epoch": 0.09847471233609847, "grad_norm": 0.2723265588283539, "kl": 0.0028915356379002333, "learning_rate": 1.802997858672377e-05, "loss": 0.0001, "reward": 1.03125, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.21875, "step": 92 }, { "completion_length": 141.8125, "epoch": 0.09954508964409954, "grad_norm": 0.29708626866340637, "kl": 0.004337737511377782, "learning_rate": 1.8008565310492507e-05, "loss": 0.0002, "reward": 1.0, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 93 }, { "completion_length": 137.75, "epoch": 0.10061546695210062, "grad_norm": 0.2878601849079132, "kl": 0.003706938645336777, "learning_rate": 1.7987152034261244e-05, "loss": 0.0001, "reward": 1.34375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.46875, "step": 94 }, { "completion_length": 141.4375, "epoch": 0.10168584426010169, "grad_norm": 0.3628558814525604, "kl": 0.0030180501053109765, "learning_rate": 1.796573875802998e-05, "loss": 0.0001, "reward": 0.875, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.3125, "step": 95 }, { "completion_length": 138.875, "epoch": 0.10275622156810275, "grad_norm": 0.28441697359085083, "kl": 0.0026011289737652987, "learning_rate": 1.7944325481798716e-05, "loss": 0.0001, "reward": 0.84375, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 96 }, { "completion_length": 133.1875, "epoch": 0.10382659887610382, "grad_norm": 0.4102269113063812, "kl": 0.004514640895649791, "learning_rate": 1.7922912205567453e-05, "loss": 0.0002, "reward": 0.90625, "reward_std": 0.6629125997424126, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 97 }, { "completion_length": 145.5, "epoch": 0.1048969761841049, "grad_norm": 0.3805445730686188, "kl": 0.00325759599218145, "learning_rate": 1.790149892933619e-05, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.28125, "step": 98 }, { "completion_length": 143.0625, "epoch": 0.10596735349210597, "grad_norm": 0.00694437837228179, "kl": 0.002611002331832424, "learning_rate": 1.7880085653104928e-05, "loss": 0.0001, "reward": 1.25, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 99 }, { "completion_length": 149.1875, "epoch": 0.10703773080010703, "grad_norm": 0.22977511584758759, "kl": 0.003544444392900914, "learning_rate": 1.785867237687366e-05, "loss": 0.0001, "reward": 0.9375, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 100 }, { "completion_length": 137.6875, "epoch": 0.10810810810810811, "grad_norm": 0.4047200381755829, "kl": 0.003274464572314173, "learning_rate": 1.78372591006424e-05, "loss": 0.0001, "reward": 0.78125, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.21875, "step": 101 }, { "completion_length": 137.3125, "epoch": 0.10917848541610918, "grad_norm": 0.34070447087287903, "kl": 0.004425200575497001, "learning_rate": 1.7815845824411136e-05, "loss": 0.0002, "reward": 0.90625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.46875, "step": 102 }, { "completion_length": 145.5625, "epoch": 0.11024886272411025, "grad_norm": 0.27202779054641724, "kl": 0.002214608626672998, "learning_rate": 1.7794432548179874e-05, "loss": 0.0001, "reward": 1.1875, "reward_std": 0.4419417232275009, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.375, "step": 103 }, { "completion_length": 144.5625, "epoch": 0.11131924003211131, "grad_norm": 0.4576094150543213, "kl": 0.004403504892252386, "learning_rate": 1.777301927194861e-05, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 104 }, { "completion_length": 141.5, "epoch": 0.1123896173401124, "grad_norm": 0.3413035273551941, "kl": 0.002096715325023979, "learning_rate": 1.7751605995717348e-05, "loss": 0.0001, "reward": 0.90625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 105 }, { "completion_length": 147.375, "epoch": 0.11345999464811346, "grad_norm": 0.3990113139152527, "kl": 0.003911934210918844, "learning_rate": 1.7730192719486082e-05, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.5745242461562157, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 106 }, { "completion_length": 134.5625, "epoch": 0.11453037195611453, "grad_norm": 0.3869004547595978, "kl": 0.004560935776680708, "learning_rate": 1.770877944325482e-05, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.4419417269527912, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.3125, "step": 107 }, { "completion_length": 142.5625, "epoch": 0.11560074926411561, "grad_norm": 0.44793108105659485, "kl": 0.004866090719588101, "learning_rate": 1.7687366167023557e-05, "loss": 0.0002, "reward": 0.75, "reward_std": 0.3535533770918846, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.3125, "step": 108 }, { "completion_length": 145.1875, "epoch": 0.11667112657211667, "grad_norm": 0.29144322872161865, "kl": 0.012738658871967345, "learning_rate": 1.7665952890792294e-05, "loss": 0.0005, "reward": 0.71875, "reward_std": 0.30935921519994736, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.28125, "step": 109 }, { "completion_length": 144.8125, "epoch": 0.11774150388011774, "grad_norm": 0.29305094480514526, "kl": 0.0024937497510109097, "learning_rate": 1.7644539614561028e-05, "loss": 0.0001, "reward": 0.875, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 110 }, { "completion_length": 137.9375, "epoch": 0.1188118811881188, "grad_norm": 0.31856557726860046, "kl": 0.005451642500702292, "learning_rate": 1.7623126338329766e-05, "loss": 0.0002, "reward": 1.25, "reward_std": 0.2651650421321392, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 111 }, { "completion_length": 135.5, "epoch": 0.11988225849611989, "grad_norm": 0.35861292481422424, "kl": 0.0028938865289092064, "learning_rate": 1.7601713062098503e-05, "loss": 0.0001, "reward": 0.8125, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.375, "step": 112 }, { "completion_length": 147.75, "epoch": 0.12095263580412095, "grad_norm": 0.4398387670516968, "kl": 0.008004022995010018, "learning_rate": 1.758029978586724e-05, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.5303300805389881, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.3125, "step": 113 }, { "completion_length": 128.0625, "epoch": 0.12202301311212202, "grad_norm": 0.32582423090934753, "kl": 0.004148474836256355, "learning_rate": 1.7558886509635974e-05, "loss": 0.0002, "reward": 1.40625, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.9375, "rewards/format_reward_func_qa": 0.46875, "step": 114 }, { "completion_length": 146.5625, "epoch": 0.1230933904201231, "grad_norm": 0.32748767733573914, "kl": 0.004085249325726181, "learning_rate": 1.753747323340471e-05, "loss": 0.0002, "reward": 0.875, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 115 }, { "completion_length": 148.625, "epoch": 0.12416376772812417, "grad_norm": 0.2751002013683319, "kl": 0.0028387658239807934, "learning_rate": 1.751605995717345e-05, "loss": 0.0001, "reward": 0.75, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.375, "step": 116 }, { "completion_length": 140.0625, "epoch": 0.12523414503612523, "grad_norm": 0.3522147834300995, "kl": 0.0043116286396980286, "learning_rate": 1.7494646680942186e-05, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.30935920402407646, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.40625, "step": 117 }, { "completion_length": 140.4375, "epoch": 0.1263045223441263, "grad_norm": 0.389006644487381, "kl": 0.0036854239006061107, "learning_rate": 1.7473233404710924e-05, "loss": 0.0001, "reward": 1.125, "reward_std": 0.4419417232275009, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.375, "step": 118 }, { "completion_length": 132.4375, "epoch": 0.12737489965212737, "grad_norm": 0.2676617205142975, "kl": 0.0036521232104860246, "learning_rate": 1.745182012847966e-05, "loss": 0.0001, "reward": 1.09375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 119 }, { "completion_length": 138.125, "epoch": 0.12844527696012845, "grad_norm": 0.24384541809558868, "kl": 0.003799416037509218, "learning_rate": 1.7430406852248395e-05, "loss": 0.0002, "reward": 0.875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 120 }, { "completion_length": 146.1875, "epoch": 0.12951565426812953, "grad_norm": 0.32482048869132996, "kl": 0.005263130064122379, "learning_rate": 1.7408993576017132e-05, "loss": 0.0002, "reward": 1.125, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.375, "step": 121 }, { "completion_length": 131.375, "epoch": 0.13058603157613058, "grad_norm": 0.33309605717658997, "kl": 0.004265371098881587, "learning_rate": 1.738758029978587e-05, "loss": 0.0002, "reward": 1.125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 122 }, { "completion_length": 149.875, "epoch": 0.13165640888413166, "grad_norm": 0.24524253606796265, "kl": 0.002822803275194019, "learning_rate": 1.7366167023554607e-05, "loss": 0.0001, "reward": 0.84375, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 123 }, { "completion_length": 146.9375, "epoch": 0.13272678619213274, "grad_norm": 0.39864134788513184, "kl": 0.003972923208493739, "learning_rate": 1.734475374732334e-05, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.34375, "step": 124 }, { "completion_length": 139.5, "epoch": 0.1337971635001338, "grad_norm": 0.22536563873291016, "kl": 0.0031395246041938663, "learning_rate": 1.7323340471092078e-05, "loss": 0.0001, "reward": 0.90625, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 125 }, { "completion_length": 136.3125, "epoch": 0.13486754080813487, "grad_norm": 0.3268199563026428, "kl": 0.003154373320285231, "learning_rate": 1.7301927194860816e-05, "loss": 0.0001, "reward": 1.21875, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 126 }, { "completion_length": 146.3125, "epoch": 0.13593791811613593, "grad_norm": 0.32845985889434814, "kl": 0.0035636365064419806, "learning_rate": 1.7280513918629553e-05, "loss": 0.0001, "reward": 0.84375, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.28125, "step": 127 }, { "completion_length": 142.625, "epoch": 0.137008295424137, "grad_norm": 0.34915241599082947, "kl": 0.004428373940754682, "learning_rate": 1.7259100642398287e-05, "loss": 0.0002, "reward": 0.75, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.3125, "step": 128 }, { "completion_length": 131.1875, "epoch": 0.13807867273213809, "grad_norm": 0.15303069353103638, "kl": 0.004605844849720597, "learning_rate": 1.7237687366167024e-05, "loss": 0.0002, "reward": 1.125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 129 }, { "completion_length": 143.1875, "epoch": 0.13914905004013914, "grad_norm": 0.21830038726329803, "kl": 0.004901626787614077, "learning_rate": 1.721627408993576e-05, "loss": 0.0002, "reward": 1.03125, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 130 }, { "completion_length": 133.125, "epoch": 0.14021942734814022, "grad_norm": 0.5472941994667053, "kl": 0.00383291975595057, "learning_rate": 1.71948608137045e-05, "loss": 0.0002, "reward": 0.65625, "reward_std": 0.39774755015969276, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.21875, "step": 131 }, { "completion_length": 140.9375, "epoch": 0.1412898046561413, "grad_norm": 0.3814188838005066, "kl": 0.003858637297526002, "learning_rate": 1.7173447537473236e-05, "loss": 0.0002, "reward": 1.25, "reward_std": 0.3535533770918846, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 132 }, { "completion_length": 144.25, "epoch": 0.14236018196414235, "grad_norm": 0.20392437279224396, "kl": 0.00272561568999663, "learning_rate": 1.7152034261241974e-05, "loss": 0.0001, "reward": 1.15625, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.34375, "step": 133 }, { "completion_length": 142.625, "epoch": 0.14343055927214343, "grad_norm": 0.43881624937057495, "kl": 0.005386390257626772, "learning_rate": 1.7130620985010707e-05, "loss": 0.0002, "reward": 1.0, "reward_std": 0.5303300768136978, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 134 }, { "completion_length": 137.625, "epoch": 0.1445009365801445, "grad_norm": 0.2348921298980713, "kl": 0.0038961140671744943, "learning_rate": 1.7109207708779445e-05, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.5, "step": 135 }, { "completion_length": 136.875, "epoch": 0.14557131388814556, "grad_norm": 0.386441707611084, "kl": 0.005566103849560022, "learning_rate": 1.7087794432548182e-05, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 136 }, { "completion_length": 139.0625, "epoch": 0.14664169119614665, "grad_norm": 0.3432142436504364, "kl": 0.009089458268135786, "learning_rate": 1.706638115631692e-05, "loss": 0.0004, "reward": 0.96875, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.34375, "step": 137 }, { "completion_length": 124.625, "epoch": 0.14771206850414773, "grad_norm": 0.2704710066318512, "kl": 0.006665578228421509, "learning_rate": 1.7044967880085653e-05, "loss": 0.0003, "reward": 0.875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.4375, "step": 138 }, { "completion_length": 136.9375, "epoch": 0.14878244581214878, "grad_norm": 0.30318889021873474, "kl": 0.005749135743826628, "learning_rate": 1.702355460385439e-05, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 139 }, { "completion_length": 143.875, "epoch": 0.14985282312014986, "grad_norm": 0.3459167182445526, "kl": 0.005650177365168929, "learning_rate": 1.7002141327623128e-05, "loss": 0.0002, "reward": 0.875, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 140 }, { "completion_length": 141.25, "epoch": 0.1509232004281509, "grad_norm": 0.31596067547798157, "kl": 0.0035053706960752606, "learning_rate": 1.6980728051391862e-05, "loss": 0.0001, "reward": 0.875, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 141 }, { "completion_length": 150.0, "epoch": 0.151993577736152, "grad_norm": 0.28568682074546814, "kl": 0.004275922197848558, "learning_rate": 1.69593147751606e-05, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.4375, "step": 142 }, { "completion_length": 143.625, "epoch": 0.15306395504415307, "grad_norm": 0.363053560256958, "kl": 0.00699933897703886, "learning_rate": 1.6937901498929337e-05, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.34375, "step": 143 }, { "completion_length": 148.625, "epoch": 0.15413433235215412, "grad_norm": 0.37071719765663147, "kl": 0.0035979216918349266, "learning_rate": 1.6916488222698074e-05, "loss": 0.0001, "reward": 1.25, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 144 }, { "completion_length": 139.1875, "epoch": 0.1552047096601552, "grad_norm": 0.2829369604587555, "kl": 0.0050278258277103305, "learning_rate": 1.689507494646681e-05, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 145 }, { "completion_length": 150.0, "epoch": 0.15627508696815628, "grad_norm": 0.2554768919944763, "kl": 0.0038810845580883324, "learning_rate": 1.687366167023555e-05, "loss": 0.0002, "reward": 0.90625, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 146 }, { "completion_length": 150.0, "epoch": 0.15734546427615734, "grad_norm": 0.3385763466358185, "kl": 0.004087230772711337, "learning_rate": 1.6852248394004286e-05, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.22097086906433105, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.28125, "step": 147 }, { "completion_length": 149.875, "epoch": 0.15841584158415842, "grad_norm": 0.31590044498443604, "kl": 0.004785956698469818, "learning_rate": 1.683083511777302e-05, "loss": 0.0002, "reward": 1.0, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 148 }, { "completion_length": 127.3125, "epoch": 0.1594862188921595, "grad_norm": 0.44572892785072327, "kl": 0.010639799758791924, "learning_rate": 1.6809421841541757e-05, "loss": 0.0004, "reward": 1.0625, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 149 }, { "completion_length": 150.0, "epoch": 0.16055659620016055, "grad_norm": 0.24803031980991364, "kl": 0.004325821704696864, "learning_rate": 1.6788008565310495e-05, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 150 }, { "completion_length": 140.6875, "epoch": 0.16162697350816163, "grad_norm": 0.31066834926605225, "kl": 0.0044672509538941085, "learning_rate": 1.6766595289079232e-05, "loss": 0.0002, "reward": 0.90625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.46875, "step": 151 }, { "completion_length": 142.375, "epoch": 0.16269735081616268, "grad_norm": 0.2352777123451233, "kl": 0.004322368418797851, "learning_rate": 1.6745182012847966e-05, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 152 }, { "completion_length": 147.5, "epoch": 0.16376772812416376, "grad_norm": 0.3858872950077057, "kl": 0.003669942496344447, "learning_rate": 1.6723768736616703e-05, "loss": 0.0001, "reward": 0.75, "reward_std": 0.3535533882677555, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.3125, "step": 153 }, { "completion_length": 144.5625, "epoch": 0.16483810543216484, "grad_norm": 0.29432910680770874, "kl": 0.005889099091291428, "learning_rate": 1.670235546038544e-05, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.2651650421321392, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.375, "step": 154 }, { "completion_length": 146.0625, "epoch": 0.1659084827401659, "grad_norm": 0.1848280131816864, "kl": 0.0038604768342338502, "learning_rate": 1.6680942184154175e-05, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.5, "step": 155 }, { "completion_length": 145.0, "epoch": 0.16697886004816698, "grad_norm": 0.35190001130104065, "kl": 0.004090804199222475, "learning_rate": 1.6659528907922912e-05, "loss": 0.0002, "reward": 1.03125, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.46875, "step": 156 }, { "completion_length": 143.0625, "epoch": 0.16804923735616806, "grad_norm": 0.42431941628456116, "kl": 0.005506119225174189, "learning_rate": 1.663811563169165e-05, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.3535533770918846, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.3125, "step": 157 }, { "completion_length": 141.3125, "epoch": 0.1691196146641691, "grad_norm": 0.45111292600631714, "kl": 0.007150916615501046, "learning_rate": 1.6616702355460387e-05, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.34375, "step": 158 }, { "completion_length": 142.1875, "epoch": 0.1701899919721702, "grad_norm": 0.32674315571784973, "kl": 0.005725887720473111, "learning_rate": 1.6595289079229124e-05, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 159 }, { "completion_length": 141.3125, "epoch": 0.17126036928017127, "grad_norm": 0.3575911521911621, "kl": 0.006169893313199282, "learning_rate": 1.657387580299786e-05, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 160 }, { "completion_length": 144.5625, "epoch": 0.17233074658817232, "grad_norm": 0.2529573440551758, "kl": 0.008369135321117938, "learning_rate": 1.65524625267666e-05, "loss": 0.0003, "reward": 1.4375, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.9375, "rewards/format_reward_func_qa": 0.5, "step": 161 }, { "completion_length": 145.1875, "epoch": 0.1734011238961734, "grad_norm": 0.34696513414382935, "kl": 0.005210649105720222, "learning_rate": 1.6531049250535333e-05, "loss": 0.0002, "reward": 1.125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 162 }, { "completion_length": 145.5, "epoch": 0.17447150120417448, "grad_norm": 0.2668897807598114, "kl": 0.00826659012818709, "learning_rate": 1.650963597430407e-05, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 163 }, { "completion_length": 133.75, "epoch": 0.17554187851217554, "grad_norm": 0.3457169234752655, "kl": 0.010637891129590571, "learning_rate": 1.6488222698072807e-05, "loss": 0.0004, "reward": 1.03125, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 164 }, { "completion_length": 144.375, "epoch": 0.17661225582017662, "grad_norm": 0.3047502636909485, "kl": 0.00544026552233845, "learning_rate": 1.646680942184154e-05, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 165 }, { "completion_length": 137.375, "epoch": 0.17768263312817767, "grad_norm": 0.25198763608932495, "kl": 0.0067144655622541904, "learning_rate": 1.644539614561028e-05, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 166 }, { "completion_length": 144.6875, "epoch": 0.17875301043617875, "grad_norm": 0.3483259379863739, "kl": 0.004684107028879225, "learning_rate": 1.6423982869379016e-05, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 167 }, { "completion_length": 147.4375, "epoch": 0.17982338774417983, "grad_norm": 0.16461433470249176, "kl": 0.00538569176569581, "learning_rate": 1.6402569593147753e-05, "loss": 0.0002, "reward": 1.125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.375, "step": 168 }, { "completion_length": 145.8125, "epoch": 0.18089376505218088, "grad_norm": 0.326415091753006, "kl": 0.0060388309066183865, "learning_rate": 1.6381156316916487e-05, "loss": 0.0002, "reward": 1.125, "reward_std": 0.3535533770918846, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 169 }, { "completion_length": 139.0, "epoch": 0.18196414236018196, "grad_norm": 0.3119191825389862, "kl": 0.007606733008287847, "learning_rate": 1.6359743040685225e-05, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.34375, "step": 170 }, { "completion_length": 129.9375, "epoch": 0.18303451966818304, "grad_norm": 0.25784996151924133, "kl": 0.009314680006355047, "learning_rate": 1.6338329764453962e-05, "loss": 0.0004, "reward": 1.375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.5, "step": 171 }, { "completion_length": 150.0, "epoch": 0.1841048969761841, "grad_norm": 0.33974555134773254, "kl": 0.005115948908496648, "learning_rate": 1.63169164882227e-05, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.30935920402407646, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.34375, "step": 172 }, { "completion_length": 144.375, "epoch": 0.18517527428418518, "grad_norm": 0.310312420129776, "kl": 0.006770205684006214, "learning_rate": 1.6295503211991437e-05, "loss": 0.0003, "reward": 1.25, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 173 }, { "completion_length": 146.75, "epoch": 0.18624565159218626, "grad_norm": 0.18999303877353668, "kl": 0.013071838184259832, "learning_rate": 1.6274089935760174e-05, "loss": 0.0005, "reward": 1.125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 174 }, { "completion_length": 144.25, "epoch": 0.1873160289001873, "grad_norm": 0.4987257122993469, "kl": 0.00535991694778204, "learning_rate": 1.625267665952891e-05, "loss": 0.0002, "reward": 1.125, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 175 }, { "completion_length": 150.0, "epoch": 0.1883864062081884, "grad_norm": 0.31655922532081604, "kl": 0.006024509319104254, "learning_rate": 1.6231263383297645e-05, "loss": 0.0002, "reward": 1.125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 176 }, { "completion_length": 150.0, "epoch": 0.18945678351618947, "grad_norm": 0.3333751857280731, "kl": 0.004986538726370782, "learning_rate": 1.6209850107066383e-05, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 177 }, { "completion_length": 149.6875, "epoch": 0.19052716082419052, "grad_norm": 0.4376201331615448, "kl": 0.004471645574085414, "learning_rate": 1.618843683083512e-05, "loss": 0.0002, "reward": 0.84375, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.40625, "step": 178 }, { "completion_length": 141.1875, "epoch": 0.1915975381321916, "grad_norm": 0.5181428790092468, "kl": 0.011660258402116597, "learning_rate": 1.6167023554603854e-05, "loss": 0.0005, "reward": 0.75, "reward_std": 0.4419417269527912, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.375, "step": 179 }, { "completion_length": 150.0, "epoch": 0.19266791544019266, "grad_norm": 0.3482414484024048, "kl": 0.006926922942511737, "learning_rate": 1.614561027837259e-05, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.39774756133556366, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.34375, "step": 180 }, { "completion_length": 150.0, "epoch": 0.19373829274819374, "grad_norm": 0.25020086765289307, "kl": 0.00463401252636686, "learning_rate": 1.612419700214133e-05, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 181 }, { "completion_length": 128.875, "epoch": 0.19480867005619482, "grad_norm": 0.19893690943717957, "kl": 0.014308148063719273, "learning_rate": 1.6102783725910066e-05, "loss": 0.0006, "reward": 0.96875, "reward_std": 0.22097086906433105, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.34375, "step": 182 }, { "completion_length": 148.875, "epoch": 0.19587904736419587, "grad_norm": 0.23766586184501648, "kl": 0.006266304990276694, "learning_rate": 1.60813704496788e-05, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.375, "step": 183 }, { "completion_length": 140.0625, "epoch": 0.19694942467219695, "grad_norm": 0.27007901668548584, "kl": 0.007623352110385895, "learning_rate": 1.6059957173447537e-05, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.46875, "step": 184 }, { "completion_length": 146.625, "epoch": 0.19801980198019803, "grad_norm": 0.2326001524925232, "kl": 0.00440504786092788, "learning_rate": 1.6038543897216275e-05, "loss": 0.0002, "reward": 0.84375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 185 }, { "completion_length": 145.5625, "epoch": 0.19909017928819908, "grad_norm": 0.14321264624595642, "kl": 0.0051883000414818525, "learning_rate": 1.6017130620985012e-05, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 186 }, { "completion_length": 150.0, "epoch": 0.20016055659620016, "grad_norm": 0.24787946045398712, "kl": 0.0037101278721820563, "learning_rate": 1.599571734475375e-05, "loss": 0.0001, "reward": 0.46875, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.15625, "step": 187 }, { "completion_length": 145.6875, "epoch": 0.20123093390420124, "grad_norm": 0.2701970338821411, "kl": 0.006797288311645389, "learning_rate": 1.5974304068522487e-05, "loss": 0.0003, "reward": 1.28125, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 188 }, { "completion_length": 144.9375, "epoch": 0.2023013112122023, "grad_norm": 0.32277870178222656, "kl": 0.006620479631237686, "learning_rate": 1.595289079229122e-05, "loss": 0.0003, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 189 }, { "completion_length": 146.5625, "epoch": 0.20337168852020338, "grad_norm": 0.37462103366851807, "kl": 0.008417986333370209, "learning_rate": 1.5931477516059958e-05, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 190 }, { "completion_length": 132.25, "epoch": 0.20444206582820446, "grad_norm": 0.2718920111656189, "kl": 0.01220483216457069, "learning_rate": 1.5910064239828695e-05, "loss": 0.0005, "reward": 0.84375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.46875, "step": 191 }, { "completion_length": 142.5625, "epoch": 0.2055124431362055, "grad_norm": 0.23734301328659058, "kl": 0.006222203490324318, "learning_rate": 1.5888650963597433e-05, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 192 }, { "completion_length": 146.875, "epoch": 0.2065828204442066, "grad_norm": 0.329610139131546, "kl": 0.009306765859946609, "learning_rate": 1.5867237687366167e-05, "loss": 0.0004, "reward": 1.25, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 193 }, { "completion_length": 135.75, "epoch": 0.20765319775220764, "grad_norm": 0.3424364924430847, "kl": 0.007034900947473943, "learning_rate": 1.5845824411134904e-05, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 194 }, { "completion_length": 150.0, "epoch": 0.20872357506020872, "grad_norm": 0.012570245191454887, "kl": 0.006533449748530984, "learning_rate": 1.582441113490364e-05, "loss": 0.0003, "reward": 0.875, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 195 }, { "completion_length": 146.0, "epoch": 0.2097939523682098, "grad_norm": 0.2770825922489166, "kl": 0.008247710415162146, "learning_rate": 1.580299785867238e-05, "loss": 0.0003, "reward": 1.25, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 196 }, { "completion_length": 150.0, "epoch": 0.21086432967621085, "grad_norm": 0.3455538749694824, "kl": 0.005142325651831925, "learning_rate": 1.5781584582441113e-05, "loss": 0.0002, "reward": 0.75, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.375, "step": 197 }, { "completion_length": 139.0625, "epoch": 0.21193470698421193, "grad_norm": 0.359478235244751, "kl": 0.006735144765116274, "learning_rate": 1.576017130620985e-05, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.5, "step": 198 }, { "completion_length": 126.5625, "epoch": 0.21300508429221301, "grad_norm": 0.34195834398269653, "kl": 0.01025110692717135, "learning_rate": 1.5738758029978587e-05, "loss": 0.0004, "reward": 1.34375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.46875, "step": 199 }, { "completion_length": 146.9375, "epoch": 0.21407546160021407, "grad_norm": 0.3972872197628021, "kl": 0.008201245334930718, "learning_rate": 1.5717344753747325e-05, "loss": 0.0003, "reward": 1.125, "reward_std": 0.3535533882677555, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 200 }, { "completion_length": 141.125, "epoch": 0.21514583890821515, "grad_norm": 0.31137824058532715, "kl": 0.011072686640545726, "learning_rate": 1.5695931477516062e-05, "loss": 0.0004, "reward": 1.0, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 201 }, { "completion_length": 139.75, "epoch": 0.21621621621621623, "grad_norm": 0.3144007921218872, "kl": 0.007215305580757558, "learning_rate": 1.56745182012848e-05, "loss": 0.0003, "reward": 1.09375, "reward_std": 0.30935921519994736, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 202 }, { "completion_length": 144.5625, "epoch": 0.21728659352421728, "grad_norm": 0.3361535966396332, "kl": 0.006086603680159897, "learning_rate": 1.5653104925053533e-05, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.46875, "step": 203 }, { "completion_length": 143.6875, "epoch": 0.21835697083221836, "grad_norm": 0.5181671380996704, "kl": 0.007427173666656017, "learning_rate": 1.563169164882227e-05, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 204 }, { "completion_length": 149.625, "epoch": 0.2194273481402194, "grad_norm": 0.11053427308797836, "kl": 0.0057265578070655465, "learning_rate": 1.5610278372591008e-05, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 205 }, { "completion_length": 137.625, "epoch": 0.2204977254482205, "grad_norm": 0.318547785282135, "kl": 0.008856757776811719, "learning_rate": 1.5588865096359745e-05, "loss": 0.0004, "reward": 1.125, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.5, "step": 206 }, { "completion_length": 150.0, "epoch": 0.22156810275622157, "grad_norm": 0.4614088535308838, "kl": 0.0071030791150406, "learning_rate": 1.556745182012848e-05, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.6187184229493141, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 207 }, { "completion_length": 150.0, "epoch": 0.22263848006422263, "grad_norm": 0.289823979139328, "kl": 0.00738685205578804, "learning_rate": 1.5546038543897217e-05, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.34375, "step": 208 }, { "completion_length": 139.625, "epoch": 0.2237088573722237, "grad_norm": 0.18911366164684296, "kl": 0.005441827932372689, "learning_rate": 1.5524625267665954e-05, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 209 }, { "completion_length": 143.3125, "epoch": 0.2247792346802248, "grad_norm": 0.420041024684906, "kl": 0.006481884163804352, "learning_rate": 1.550321199143469e-05, "loss": 0.0003, "reward": 1.125, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.375, "step": 210 }, { "completion_length": 149.75, "epoch": 0.22584961198822584, "grad_norm": 0.4482715427875519, "kl": 0.008336510160006583, "learning_rate": 1.5481798715203425e-05, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.30935921519994736, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 211 }, { "completion_length": 150.0, "epoch": 0.22691998929622692, "grad_norm": 0.2646245062351227, "kl": 0.007252135954331607, "learning_rate": 1.5460385438972163e-05, "loss": 0.0003, "reward": 0.53125, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.25, "rewards/format_reward_func_qa": 0.28125, "step": 212 }, { "completion_length": 143.0, "epoch": 0.227990366604228, "grad_norm": 0.3131590187549591, "kl": 0.0077580115757882595, "learning_rate": 1.54389721627409e-05, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.375, "step": 213 }, { "completion_length": 145.9375, "epoch": 0.22906074391222905, "grad_norm": 0.2903270125389099, "kl": 0.008237255970016122, "learning_rate": 1.5417558886509637e-05, "loss": 0.0003, "reward": 0.75, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.375, "step": 214 }, { "completion_length": 142.3125, "epoch": 0.23013112122023013, "grad_norm": 0.14063481986522675, "kl": 0.007097553112544119, "learning_rate": 1.5396145610278375e-05, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.375, "step": 215 }, { "completion_length": 145.25, "epoch": 0.23120149852823121, "grad_norm": 0.020205190405249596, "kl": 0.009018055512569845, "learning_rate": 1.5374732334047112e-05, "loss": 0.0004, "reward": 1.0, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.5, "step": 216 }, { "completion_length": 150.0, "epoch": 0.23227187583623227, "grad_norm": 0.4468995928764343, "kl": 0.005243368388619274, "learning_rate": 1.5353319057815846e-05, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.3535533882677555, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.375, "step": 217 }, { "completion_length": 144.375, "epoch": 0.23334225314423335, "grad_norm": 0.4661354720592499, "kl": 0.00629305059555918, "learning_rate": 1.5331905781584583e-05, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.4375, "step": 218 }, { "completion_length": 150.0, "epoch": 0.2344126304522344, "grad_norm": 0.2500506043434143, "kl": 0.0055136485025286674, "learning_rate": 1.531049250535332e-05, "loss": 0.0002, "reward": 1.375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.5, "step": 219 }, { "completion_length": 140.9375, "epoch": 0.23548300776023548, "grad_norm": 0.37934842705726624, "kl": 0.006019769352860749, "learning_rate": 1.5289079229122058e-05, "loss": 0.0002, "reward": 0.90625, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 220 }, { "completion_length": 135.75, "epoch": 0.23655338506823656, "grad_norm": 0.38555335998535156, "kl": 0.007120768656022847, "learning_rate": 1.5267665952890792e-05, "loss": 0.0003, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 221 }, { "completion_length": 150.0, "epoch": 0.2376237623762376, "grad_norm": 0.2715327739715576, "kl": 0.007476891158148646, "learning_rate": 1.524625267665953e-05, "loss": 0.0003, "reward": 1.0, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 222 }, { "completion_length": 144.0625, "epoch": 0.2386941396842387, "grad_norm": 0.3015255033969879, "kl": 0.007314686547033489, "learning_rate": 1.5224839400428267e-05, "loss": 0.0003, "reward": 1.125, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.5, "step": 223 }, { "completion_length": 144.625, "epoch": 0.23976451699223977, "grad_norm": 0.13369715213775635, "kl": 0.00593476346693933, "learning_rate": 1.5203426124197002e-05, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.40625, "step": 224 }, { "completion_length": 140.1875, "epoch": 0.24083489430024083, "grad_norm": 0.37559810280799866, "kl": 0.005120923509821296, "learning_rate": 1.518201284796574e-05, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 225 }, { "completion_length": 140.625, "epoch": 0.2419052716082419, "grad_norm": 0.44624948501586914, "kl": 0.006767567712813616, "learning_rate": 1.5160599571734475e-05, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.3535533770918846, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.375, "step": 226 }, { "completion_length": 119.5625, "epoch": 0.242975648916243, "grad_norm": 0.3014875054359436, "kl": 0.00768728950060904, "learning_rate": 1.5139186295503214e-05, "loss": 0.0003, "reward": 1.15625, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.40625, "step": 227 }, { "completion_length": 141.0625, "epoch": 0.24404602622424404, "grad_norm": 0.5212093591690063, "kl": 0.006814891821704805, "learning_rate": 1.511777301927195e-05, "loss": 0.0003, "reward": 0.875, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 228 }, { "completion_length": 150.0, "epoch": 0.24511640353224512, "grad_norm": 0.38750529289245605, "kl": 0.005428965087048709, "learning_rate": 1.5096359743040687e-05, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.5, "step": 229 }, { "completion_length": 138.75, "epoch": 0.2461867808402462, "grad_norm": 0.4885742962360382, "kl": 0.008686224464327097, "learning_rate": 1.5074946466809423e-05, "loss": 0.0003, "reward": 1.09375, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 230 }, { "completion_length": 140.3125, "epoch": 0.24725715814824725, "grad_norm": 0.3139186501502991, "kl": 0.0069216901902109385, "learning_rate": 1.505353319057816e-05, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.40625, "step": 231 }, { "completion_length": 144.625, "epoch": 0.24832753545624833, "grad_norm": 0.33277660608291626, "kl": 0.005106811091536656, "learning_rate": 1.5032119914346896e-05, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.25, "step": 232 }, { "completion_length": 131.125, "epoch": 0.24939791276424939, "grad_norm": 0.4156568646430969, "kl": 0.006144103594124317, "learning_rate": 1.5010706638115633e-05, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 233 }, { "completion_length": 136.875, "epoch": 0.25046829007225047, "grad_norm": 0.33087489008903503, "kl": 0.00909916777163744, "learning_rate": 1.4989293361884369e-05, "loss": 0.0004, "reward": 1.0, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.3125, "step": 234 }, { "completion_length": 150.0, "epoch": 0.25153866738025155, "grad_norm": 0.3908926248550415, "kl": 0.0045427910081343725, "learning_rate": 1.4967880085653106e-05, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.3125, "step": 235 }, { "completion_length": 146.0625, "epoch": 0.2526090446882526, "grad_norm": 0.2080722153186798, "kl": 0.006125298328697681, "learning_rate": 1.4946466809421842e-05, "loss": 0.0002, "reward": 1.125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 236 }, { "completion_length": 150.0, "epoch": 0.2536794219962537, "grad_norm": 0.3016617000102997, "kl": 0.004153968533501029, "learning_rate": 1.492505353319058e-05, "loss": 0.0002, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 237 }, { "completion_length": 144.6875, "epoch": 0.25474979930425473, "grad_norm": 0.32588091492652893, "kl": 0.0038490865263156593, "learning_rate": 1.4903640256959315e-05, "loss": 0.0002, "reward": 1.375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.5, "step": 238 }, { "completion_length": 143.4375, "epoch": 0.2558201766122558, "grad_norm": 0.19365452229976654, "kl": 0.00536590488627553, "learning_rate": 1.4882226980728052e-05, "loss": 0.0002, "reward": 0.84375, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 239 }, { "completion_length": 139.6875, "epoch": 0.2568905539202569, "grad_norm": 0.395223468542099, "kl": 0.0069751954870298505, "learning_rate": 1.4860813704496788e-05, "loss": 0.0003, "reward": 1.125, "reward_std": 0.5303300693631172, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 240 }, { "completion_length": 135.8125, "epoch": 0.257960931228258, "grad_norm": 0.2731679081916809, "kl": 0.006456379080191255, "learning_rate": 1.4839400428265527e-05, "loss": 0.0003, "reward": 1.15625, "reward_std": 0.30935921519994736, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 241 }, { "completion_length": 150.0, "epoch": 0.25903130853625905, "grad_norm": 0.2755889296531677, "kl": 0.0049463664763607085, "learning_rate": 1.4817987152034263e-05, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 242 }, { "completion_length": 150.0, "epoch": 0.2601016858442601, "grad_norm": 0.26052120327949524, "kl": 0.004901221487671137, "learning_rate": 1.4796573875803e-05, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.30935920402407646, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.34375, "step": 243 }, { "completion_length": 142.125, "epoch": 0.26117206315226116, "grad_norm": 0.23377935588359833, "kl": 0.009784531430341303, "learning_rate": 1.4775160599571736e-05, "loss": 0.0004, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 244 }, { "completion_length": 150.0, "epoch": 0.26224244046026224, "grad_norm": 0.4108826518058777, "kl": 0.009884261758998036, "learning_rate": 1.4753747323340473e-05, "loss": 0.0004, "reward": 0.90625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 245 }, { "completion_length": 138.125, "epoch": 0.2633128177682633, "grad_norm": 0.48919036984443665, "kl": 0.005193000833969563, "learning_rate": 1.4732334047109209e-05, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.375, "step": 246 }, { "completion_length": 131.625, "epoch": 0.2643831950762644, "grad_norm": 0.45731356739997864, "kl": 0.008820455172099173, "learning_rate": 1.4710920770877946e-05, "loss": 0.0004, "reward": 1.1875, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.5, "step": 247 }, { "completion_length": 125.125, "epoch": 0.2654535723842655, "grad_norm": 0.37961962819099426, "kl": 0.008322751265950501, "learning_rate": 1.4689507494646682e-05, "loss": 0.0003, "reward": 1.0, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 248 }, { "completion_length": 146.625, "epoch": 0.2665239496922665, "grad_norm": 0.2519012689590454, "kl": 0.00586779706645757, "learning_rate": 1.4668094218415419e-05, "loss": 0.0002, "reward": 1.125, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.5, "step": 249 }, { "completion_length": 144.375, "epoch": 0.2675943270002676, "grad_norm": 0.3722406029701233, "kl": 0.010160791454836726, "learning_rate": 1.4646680942184155e-05, "loss": 0.0004, "reward": 1.21875, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 250 }, { "completion_length": 139.125, "epoch": 0.26866470430826866, "grad_norm": 0.3627791404724121, "kl": 0.01073427964001894, "learning_rate": 1.4625267665952892e-05, "loss": 0.0004, "reward": 1.09375, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 251 }, { "completion_length": 146.0, "epoch": 0.26973508161626975, "grad_norm": 0.40180522203445435, "kl": 0.006828202400356531, "learning_rate": 1.4603854389721628e-05, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 252 }, { "completion_length": 149.3125, "epoch": 0.2708054589242708, "grad_norm": 0.23654454946517944, "kl": 0.007191521872300655, "learning_rate": 1.4582441113490365e-05, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 253 }, { "completion_length": 147.125, "epoch": 0.27187583623227185, "grad_norm": 0.19529981911182404, "kl": 0.0048531428328715265, "learning_rate": 1.45610278372591e-05, "loss": 0.0002, "reward": 0.75, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.3125, "step": 254 }, { "completion_length": 147.75, "epoch": 0.27294621354027293, "grad_norm": 0.38939473032951355, "kl": 0.007395104563329369, "learning_rate": 1.453961456102784e-05, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 255 }, { "completion_length": 130.9375, "epoch": 0.274016590848274, "grad_norm": 0.3886270523071289, "kl": 0.01682290097232908, "learning_rate": 1.4518201284796575e-05, "loss": 0.0007, "reward": 0.71875, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.34375, "step": 256 }, { "completion_length": 135.3125, "epoch": 0.2750869681562751, "grad_norm": 0.39373981952667236, "kl": 0.01281587965786457, "learning_rate": 1.4496788008565313e-05, "loss": 0.0005, "reward": 0.875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.4375, "step": 257 }, { "completion_length": 129.75, "epoch": 0.27615734546427617, "grad_norm": 0.012575002387166023, "kl": 0.005714049795642495, "learning_rate": 1.4475374732334048e-05, "loss": 0.0002, "reward": 1.125, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.375, "step": 258 }, { "completion_length": 141.4375, "epoch": 0.27722772277227725, "grad_norm": 0.21375897526741028, "kl": 0.005977701861411333, "learning_rate": 1.4453961456102786e-05, "loss": 0.0002, "reward": 0.875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 259 }, { "completion_length": 139.5, "epoch": 0.2782981000802783, "grad_norm": 0.3263048529624939, "kl": 0.006977385259233415, "learning_rate": 1.4432548179871521e-05, "loss": 0.0003, "reward": 1.375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.5, "step": 260 }, { "completion_length": 137.4375, "epoch": 0.27936847738827936, "grad_norm": 0.3623965084552765, "kl": 0.011550009832717478, "learning_rate": 1.4411134903640259e-05, "loss": 0.0005, "reward": 0.75, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.4375, "step": 261 }, { "completion_length": 143.75, "epoch": 0.28043885469628044, "grad_norm": 0.42785611748695374, "kl": 0.012586190830916166, "learning_rate": 1.4389721627408994e-05, "loss": 0.0005, "reward": 1.03125, "reward_std": 0.39774756133556366, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 262 }, { "completion_length": 141.0625, "epoch": 0.2815092320042815, "grad_norm": 0.3989890515804291, "kl": 0.01055932673625648, "learning_rate": 1.4368308351177732e-05, "loss": 0.0004, "reward": 1.09375, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 263 }, { "completion_length": 137.0625, "epoch": 0.2825796093122826, "grad_norm": 0.3878323435783386, "kl": 0.00601056485902518, "learning_rate": 1.4346895074946467e-05, "loss": 0.0002, "reward": 1.0, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 264 }, { "completion_length": 147.875, "epoch": 0.2836499866202836, "grad_norm": 0.14455240964889526, "kl": 0.006071401759982109, "learning_rate": 1.4325481798715205e-05, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.34375, "step": 265 }, { "completion_length": 126.25, "epoch": 0.2847203639282847, "grad_norm": 0.016269944608211517, "kl": 0.006944146240130067, "learning_rate": 1.430406852248394e-05, "loss": 0.0003, "reward": 1.375, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.5, "step": 266 }, { "completion_length": 150.0, "epoch": 0.2857907412362858, "grad_norm": 0.35848063230514526, "kl": 0.004755255416966975, "learning_rate": 1.4282655246252678e-05, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.30935920402407646, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.28125, "step": 267 }, { "completion_length": 147.3125, "epoch": 0.28686111854428686, "grad_norm": 0.2541021406650543, "kl": 0.008224257617257535, "learning_rate": 1.4261241970021415e-05, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.46875, "step": 268 }, { "completion_length": 146.875, "epoch": 0.28793149585228794, "grad_norm": 0.2954241633415222, "kl": 0.004365060536656529, "learning_rate": 1.4239828693790152e-05, "loss": 0.0002, "reward": 0.90625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 269 }, { "completion_length": 144.8125, "epoch": 0.289001873160289, "grad_norm": 0.29421886801719666, "kl": 0.0053277366096153855, "learning_rate": 1.4218415417558888e-05, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 270 }, { "completion_length": 150.0, "epoch": 0.29007225046829005, "grad_norm": 0.4748411476612091, "kl": 0.00636588130146265, "learning_rate": 1.4197002141327625e-05, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.5745242461562157, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.46875, "step": 271 }, { "completion_length": 140.4375, "epoch": 0.29114262777629113, "grad_norm": 0.4254905581474304, "kl": 0.008284647250548005, "learning_rate": 1.4175588865096361e-05, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.48613589257001877, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 272 }, { "completion_length": 150.0, "epoch": 0.2922130050842922, "grad_norm": 0.42788538336753845, "kl": 0.006597750179935247, "learning_rate": 1.4154175588865098e-05, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.5745242387056351, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 273 }, { "completion_length": 148.75, "epoch": 0.2932833823922933, "grad_norm": 0.6270278692245483, "kl": 0.0063065249705687165, "learning_rate": 1.4132762312633834e-05, "loss": 0.0003, "reward": 1.15625, "reward_std": 0.30935921519994736, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.40625, "step": 274 }, { "completion_length": 138.625, "epoch": 0.29435375970029437, "grad_norm": 0.37839624285697937, "kl": 0.012363988615106791, "learning_rate": 1.4111349036402571e-05, "loss": 0.0005, "reward": 0.84375, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.40625, "step": 275 }, { "completion_length": 140.0, "epoch": 0.29542413700829545, "grad_norm": 0.3272175192832947, "kl": 0.006654585944488645, "learning_rate": 1.4089935760171307e-05, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 276 }, { "completion_length": 146.125, "epoch": 0.2964945143162965, "grad_norm": 0.31905505061149597, "kl": 0.003707013325765729, "learning_rate": 1.4068522483940044e-05, "loss": 0.0001, "reward": 1.15625, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 277 }, { "completion_length": 146.3125, "epoch": 0.29756489162429756, "grad_norm": 0.35835763812065125, "kl": 0.007482488756068051, "learning_rate": 1.404710920770878e-05, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.40625, "step": 278 }, { "completion_length": 150.0, "epoch": 0.29863526893229864, "grad_norm": 0.293131560087204, "kl": 0.005498936166986823, "learning_rate": 1.4025695931477517e-05, "loss": 0.0002, "reward": 0.875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 279 }, { "completion_length": 143.6875, "epoch": 0.2997056462402997, "grad_norm": 0.3765776753425598, "kl": 0.009784213732928038, "learning_rate": 1.4004282655246253e-05, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 280 }, { "completion_length": 149.6875, "epoch": 0.3007760235483008, "grad_norm": 0.4626339375972748, "kl": 0.01028113008942455, "learning_rate": 1.3982869379014989e-05, "loss": 0.0004, "reward": 0.9375, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 281 }, { "completion_length": 150.0, "epoch": 0.3018464008563018, "grad_norm": 0.3850708305835724, "kl": 0.00940094207180664, "learning_rate": 1.3961456102783728e-05, "loss": 0.0004, "reward": 0.875, "reward_std": 0.4419417269527912, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 282 }, { "completion_length": 146.75, "epoch": 0.3029167781643029, "grad_norm": 0.27950742840766907, "kl": 0.008149268687702715, "learning_rate": 1.3940042826552465e-05, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 283 }, { "completion_length": 145.9375, "epoch": 0.303987155472304, "grad_norm": 0.3828939199447632, "kl": 0.00602063094265759, "learning_rate": 1.39186295503212e-05, "loss": 0.0002, "reward": 0.90625, "reward_std": 0.39774756133556366, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 284 }, { "completion_length": 143.9375, "epoch": 0.30505753278030506, "grad_norm": 0.47909483313560486, "kl": 0.008889288757927716, "learning_rate": 1.3897216274089938e-05, "loss": 0.0004, "reward": 0.9375, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.4375, "step": 285 }, { "completion_length": 143.1875, "epoch": 0.30612791008830614, "grad_norm": 0.33092212677001953, "kl": 0.009997703600674868, "learning_rate": 1.3875802997858674e-05, "loss": 0.0004, "reward": 1.0, "reward_std": 0.2651650309562683, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 286 }, { "completion_length": 140.4375, "epoch": 0.3071982873963072, "grad_norm": 0.15673407912254333, "kl": 0.004977605305612087, "learning_rate": 1.3854389721627411e-05, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.5, "step": 287 }, { "completion_length": 150.0, "epoch": 0.30826866470430825, "grad_norm": 0.47202402353286743, "kl": 0.00977489206707105, "learning_rate": 1.3832976445396147e-05, "loss": 0.0004, "reward": 0.90625, "reward_std": 0.662912592291832, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 288 }, { "completion_length": 150.0, "epoch": 0.30933904201230933, "grad_norm": 0.40252622961997986, "kl": 0.015889974543824792, "learning_rate": 1.3811563169164884e-05, "loss": 0.0006, "reward": 0.96875, "reward_std": 0.48613589257001877, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.34375, "step": 289 }, { "completion_length": 137.4375, "epoch": 0.3104094193203104, "grad_norm": 0.44443419575691223, "kl": 0.007967816665768623, "learning_rate": 1.379014989293362e-05, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.39774754643440247, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.34375, "step": 290 }, { "completion_length": 147.3125, "epoch": 0.3114797966283115, "grad_norm": 0.2977951765060425, "kl": 0.006096803059335798, "learning_rate": 1.3768736616702357e-05, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 291 }, { "completion_length": 146.3125, "epoch": 0.31255017393631257, "grad_norm": 0.4899381399154663, "kl": 0.008720402489416301, "learning_rate": 1.3747323340471093e-05, "loss": 0.0003, "reward": 0.75, "reward_std": 0.4419417344033718, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.3125, "step": 292 }, { "completion_length": 150.0, "epoch": 0.3136205512443136, "grad_norm": 0.24690474569797516, "kl": 0.007464456313755363, "learning_rate": 1.3725910064239828e-05, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.22097086906433105, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 293 }, { "completion_length": 133.8125, "epoch": 0.3146909285523147, "grad_norm": 0.3298121690750122, "kl": 0.004133240261580795, "learning_rate": 1.3704496788008566e-05, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.46875, "step": 294 }, { "completion_length": 147.6875, "epoch": 0.31576130586031576, "grad_norm": 0.5661970973014832, "kl": 0.007168868207372725, "learning_rate": 1.3683083511777301e-05, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.5, "step": 295 }, { "completion_length": 149.6875, "epoch": 0.31683168316831684, "grad_norm": 0.30463680624961853, "kl": 0.004781250900123268, "learning_rate": 1.366167023554604e-05, "loss": 0.0002, "reward": 0.9375, "reward_std": 0.3535533770918846, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 296 }, { "completion_length": 145.375, "epoch": 0.3179020604763179, "grad_norm": 0.3733966648578644, "kl": 0.006240170914679766, "learning_rate": 1.3640256959314778e-05, "loss": 0.0002, "reward": 0.9375, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 297 }, { "completion_length": 144.5625, "epoch": 0.318972437784319, "grad_norm": 0.4352051317691803, "kl": 0.004638432990759611, "learning_rate": 1.3618843683083513e-05, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.5303300768136978, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.375, "step": 298 }, { "completion_length": 129.5625, "epoch": 0.32004281509232, "grad_norm": 0.29963627457618713, "kl": 0.005788733251392841, "learning_rate": 1.359743040685225e-05, "loss": 0.0002, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.4375, "step": 299 }, { "completion_length": 142.625, "epoch": 0.3211131924003211, "grad_norm": 0.24243998527526855, "kl": 0.005643818876706064, "learning_rate": 1.3576017130620986e-05, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.2651650309562683, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 300 }, { "completion_length": 149.375, "epoch": 0.3221835697083222, "grad_norm": 0.42378368973731995, "kl": 0.007364020450040698, "learning_rate": 1.3554603854389724e-05, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 301 }, { "completion_length": 133.375, "epoch": 0.32325394701632326, "grad_norm": 0.2463878095149994, "kl": 0.007684101001359522, "learning_rate": 1.353319057815846e-05, "loss": 0.0003, "reward": 1.34375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.46875, "step": 302 }, { "completion_length": 147.3125, "epoch": 0.32432432432432434, "grad_norm": 0.40979817509651184, "kl": 0.005909224855713546, "learning_rate": 1.3511777301927197e-05, "loss": 0.0002, "reward": 0.875, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.4375, "step": 303 }, { "completion_length": 142.0, "epoch": 0.32539470163232537, "grad_norm": 0.3874134421348572, "kl": 0.0058472700184211135, "learning_rate": 1.3490364025695932e-05, "loss": 0.0002, "reward": 0.9375, "reward_std": 0.4419417232275009, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.4375, "step": 304 }, { "completion_length": 137.0625, "epoch": 0.32646507894032645, "grad_norm": 0.3694112300872803, "kl": 0.006664227577857673, "learning_rate": 1.3468950749464668e-05, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 305 }, { "completion_length": 144.6875, "epoch": 0.32753545624832753, "grad_norm": 0.3127671480178833, "kl": 0.004910006944555789, "learning_rate": 1.3447537473233405e-05, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 306 }, { "completion_length": 137.875, "epoch": 0.3286058335563286, "grad_norm": 0.36843863129615784, "kl": 0.009545507375150919, "learning_rate": 1.3426124197002141e-05, "loss": 0.0004, "reward": 0.96875, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 307 }, { "completion_length": 137.75, "epoch": 0.3296762108643297, "grad_norm": 0.18634013831615448, "kl": 0.0045433740597218275, "learning_rate": 1.3404710920770878e-05, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 308 }, { "completion_length": 143.8125, "epoch": 0.33074658817233077, "grad_norm": 0.13969561457633972, "kl": 0.006011317833326757, "learning_rate": 1.3383297644539614e-05, "loss": 0.0002, "reward": 1.4375, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.9375, "rewards/format_reward_func_qa": 0.5, "step": 309 }, { "completion_length": 136.3125, "epoch": 0.3318169654803318, "grad_norm": 0.3590127229690552, "kl": 0.005873505957424641, "learning_rate": 1.3361884368308353e-05, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 310 }, { "completion_length": 143.1875, "epoch": 0.3328873427883329, "grad_norm": 0.00856278371065855, "kl": 0.004575320926960558, "learning_rate": 1.334047109207709e-05, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 311 }, { "completion_length": 143.5625, "epoch": 0.33395772009633395, "grad_norm": 0.19175350666046143, "kl": 0.0046806473401375115, "learning_rate": 1.3319057815845826e-05, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 312 }, { "completion_length": 145.25, "epoch": 0.33502809740433503, "grad_norm": 0.272227942943573, "kl": 0.007119390647858381, "learning_rate": 1.3297644539614563e-05, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 313 }, { "completion_length": 136.75, "epoch": 0.3360984747123361, "grad_norm": 0.4292297065258026, "kl": 0.007977791363373399, "learning_rate": 1.3276231263383299e-05, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.39774756133556366, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 314 }, { "completion_length": 144.25, "epoch": 0.3371688520203372, "grad_norm": 0.3078269064426422, "kl": 0.004801296396180987, "learning_rate": 1.3254817987152036e-05, "loss": 0.0002, "reward": 0.90625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 315 }, { "completion_length": 145.125, "epoch": 0.3382392293283382, "grad_norm": 0.3284446597099304, "kl": 0.007507419097237289, "learning_rate": 1.3233404710920772e-05, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 316 }, { "completion_length": 135.0625, "epoch": 0.3393096066363393, "grad_norm": 0.34958720207214355, "kl": 0.010786401107907295, "learning_rate": 1.3211991434689508e-05, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.46875, "step": 317 }, { "completion_length": 135.875, "epoch": 0.3403799839443404, "grad_norm": 0.42059585452079773, "kl": 0.01110428397078067, "learning_rate": 1.3190578158458245e-05, "loss": 0.0004, "reward": 0.96875, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 318 }, { "completion_length": 142.5, "epoch": 0.34145036125234146, "grad_norm": 0.3933338224887848, "kl": 0.008363090106286108, "learning_rate": 1.316916488222698e-05, "loss": 0.0003, "reward": 1.15625, "reward_std": 0.30935921519994736, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 319 }, { "completion_length": 142.3125, "epoch": 0.34252073856034254, "grad_norm": 0.4177909195423126, "kl": 0.00782510288991034, "learning_rate": 1.3147751605995718e-05, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 320 }, { "completion_length": 138.1875, "epoch": 0.34359111586834357, "grad_norm": 0.2197863906621933, "kl": 0.008472184767015278, "learning_rate": 1.3126338329764454e-05, "loss": 0.0003, "reward": 1.375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.5, "step": 321 }, { "completion_length": 138.625, "epoch": 0.34466149317634465, "grad_norm": 0.24823102355003357, "kl": 0.007359960814937949, "learning_rate": 1.3104925053533191e-05, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 322 }, { "completion_length": 147.75, "epoch": 0.3457318704843457, "grad_norm": 0.24254311621189117, "kl": 0.004848449782002717, "learning_rate": 1.308351177730193e-05, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 323 }, { "completion_length": 139.4375, "epoch": 0.3468022477923468, "grad_norm": 0.30011963844299316, "kl": 0.005031373351812363, "learning_rate": 1.3062098501070666e-05, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.375, "step": 324 }, { "completion_length": 141.6875, "epoch": 0.3478726251003479, "grad_norm": 0.21622057259082794, "kl": 0.00858684559352696, "learning_rate": 1.3040685224839403e-05, "loss": 0.0003, "reward": 1.3125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.5, "step": 325 }, { "completion_length": 141.1875, "epoch": 0.34894300240834897, "grad_norm": 0.3546992242336273, "kl": 0.009838977653998882, "learning_rate": 1.3019271948608139e-05, "loss": 0.0004, "reward": 0.96875, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 326 }, { "completion_length": 137.3125, "epoch": 0.35001337971635, "grad_norm": 0.48192381858825684, "kl": 0.013976672198623419, "learning_rate": 1.2997858672376876e-05, "loss": 0.0006, "reward": 1.09375, "reward_std": 0.30935920402407646, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 327 }, { "completion_length": 148.25, "epoch": 0.3510837570243511, "grad_norm": 0.5347940325737, "kl": 0.008206738741137087, "learning_rate": 1.2976445396145612e-05, "loss": 0.0003, "reward": 1.125, "reward_std": 0.3535533770918846, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 328 }, { "completion_length": 136.625, "epoch": 0.35215413433235215, "grad_norm": 0.4320046603679657, "kl": 0.018715620622970164, "learning_rate": 1.2955032119914347e-05, "loss": 0.0007, "reward": 1.0625, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 329 }, { "completion_length": 129.875, "epoch": 0.35322451164035323, "grad_norm": 0.4408765137195587, "kl": 0.007698336499743164, "learning_rate": 1.2933618843683085e-05, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.48613590002059937, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 330 }, { "completion_length": 136.1875, "epoch": 0.3542948889483543, "grad_norm": 0.32450464367866516, "kl": 0.008607505704276264, "learning_rate": 1.291220556745182e-05, "loss": 0.0003, "reward": 1.15625, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.40625, "step": 331 }, { "completion_length": 129.8125, "epoch": 0.35536526625635534, "grad_norm": 0.48471125960350037, "kl": 0.011293674702756107, "learning_rate": 1.2890792291220558e-05, "loss": 0.0005, "reward": 0.9375, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.5, "step": 332 }, { "completion_length": 136.375, "epoch": 0.3564356435643564, "grad_norm": 0.5008177757263184, "kl": 0.006196987873408943, "learning_rate": 1.2869379014989293e-05, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.30935921519994736, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 333 }, { "completion_length": 144.125, "epoch": 0.3575060208723575, "grad_norm": 0.34141579270362854, "kl": 0.0081467991694808, "learning_rate": 1.284796573875803e-05, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.375, "step": 334 }, { "completion_length": 144.125, "epoch": 0.3585763981803586, "grad_norm": 0.16619983315467834, "kl": 0.005714981583878398, "learning_rate": 1.2826552462526766e-05, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.40625, "step": 335 }, { "completion_length": 145.25, "epoch": 0.35964677548835966, "grad_norm": 0.3530811071395874, "kl": 0.006937496364116669, "learning_rate": 1.2805139186295504e-05, "loss": 0.0003, "reward": 1.15625, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.40625, "step": 336 }, { "completion_length": 141.4375, "epoch": 0.36071715279636074, "grad_norm": 0.44875368475914, "kl": 0.008269356505479664, "learning_rate": 1.2783725910064243e-05, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 337 }, { "completion_length": 138.5625, "epoch": 0.36178753010436177, "grad_norm": 0.2987861633300781, "kl": 0.00601271609775722, "learning_rate": 1.2762312633832978e-05, "loss": 0.0002, "reward": 1.34375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.46875, "step": 338 }, { "completion_length": 145.6875, "epoch": 0.36285790741236285, "grad_norm": 0.2979789674282074, "kl": 0.0065531551372259855, "learning_rate": 1.2740899357601716e-05, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.34375, "step": 339 }, { "completion_length": 144.1875, "epoch": 0.3639282847203639, "grad_norm": 0.2520960569381714, "kl": 0.00900787056889385, "learning_rate": 1.2719486081370451e-05, "loss": 0.0004, "reward": 0.90625, "reward_std": 0.30935920402407646, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 340 }, { "completion_length": 142.9375, "epoch": 0.364998662028365, "grad_norm": 0.352839857339859, "kl": 0.00829396303743124, "learning_rate": 1.2698072805139187e-05, "loss": 0.0003, "reward": 1.25, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 341 }, { "completion_length": 140.75, "epoch": 0.3660690393363661, "grad_norm": 0.2590760588645935, "kl": 0.0075464011169970036, "learning_rate": 1.2676659528907924e-05, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.40625, "step": 342 }, { "completion_length": 138.5625, "epoch": 0.36713941664436717, "grad_norm": 0.30686619877815247, "kl": 0.008231584448367357, "learning_rate": 1.265524625267666e-05, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.46875, "step": 343 }, { "completion_length": 141.5625, "epoch": 0.3682097939523682, "grad_norm": 0.3153855502605438, "kl": 0.00996154174208641, "learning_rate": 1.2633832976445397e-05, "loss": 0.0004, "reward": 1.1875, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 344 }, { "completion_length": 142.1875, "epoch": 0.36928017126036927, "grad_norm": 0.3861086368560791, "kl": 0.009611231740564108, "learning_rate": 1.2612419700214133e-05, "loss": 0.0004, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 345 }, { "completion_length": 147.875, "epoch": 0.37035054856837035, "grad_norm": 0.37777015566825867, "kl": 0.008707606990355998, "learning_rate": 1.259100642398287e-05, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 346 }, { "completion_length": 147.0625, "epoch": 0.37142092587637143, "grad_norm": 0.3429633677005768, "kl": 0.005492302821949124, "learning_rate": 1.2569593147751606e-05, "loss": 0.0002, "reward": 1.375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.5, "step": 347 }, { "completion_length": 128.5625, "epoch": 0.3724913031843725, "grad_norm": 0.35462862253189087, "kl": 0.00605211325455457, "learning_rate": 1.2548179871520343e-05, "loss": 0.0002, "reward": 1.40625, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.9375, "rewards/format_reward_func_qa": 0.46875, "step": 348 }, { "completion_length": 143.3125, "epoch": 0.37356168049237354, "grad_norm": 0.43591928482055664, "kl": 0.011951107706408948, "learning_rate": 1.2526766595289079e-05, "loss": 0.0005, "reward": 0.6875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.3125, "step": 349 }, { "completion_length": 137.0625, "epoch": 0.3746320578003746, "grad_norm": 0.015068719163537025, "kl": 0.0076628910610452294, "learning_rate": 1.2505353319057816e-05, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 350 }, { "completion_length": 142.0625, "epoch": 0.3757024351083757, "grad_norm": 0.5130540728569031, "kl": 0.005498490005265921, "learning_rate": 1.2483940042826555e-05, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 351 }, { "completion_length": 144.875, "epoch": 0.3767728124163768, "grad_norm": 0.20323222875595093, "kl": 0.007479553809389472, "learning_rate": 1.2462526766595291e-05, "loss": 0.0003, "reward": 1.4375, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.9375, "rewards/format_reward_func_qa": 0.5, "step": 352 }, { "completion_length": 145.375, "epoch": 0.37784318972437786, "grad_norm": 0.3844618499279022, "kl": 0.011240929830819368, "learning_rate": 1.2441113490364027e-05, "loss": 0.0004, "reward": 0.9375, "reward_std": 0.4419417232275009, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 353 }, { "completion_length": 145.6875, "epoch": 0.37891356703237894, "grad_norm": 0.14940740168094635, "kl": 0.006145341554656625, "learning_rate": 1.2419700214132764e-05, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.5, "step": 354 }, { "completion_length": 138.625, "epoch": 0.37998394434037996, "grad_norm": 0.24438267946243286, "kl": 0.006054470664821565, "learning_rate": 1.23982869379015e-05, "loss": 0.0002, "reward": 1.03125, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.46875, "step": 355 }, { "completion_length": 138.25, "epoch": 0.38105432164838104, "grad_norm": 0.3079751431941986, "kl": 0.010313314269296825, "learning_rate": 1.2376873661670237e-05, "loss": 0.0004, "reward": 1.09375, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 356 }, { "completion_length": 146.6875, "epoch": 0.3821246989563821, "grad_norm": 0.3538281321525574, "kl": 0.005745593691244721, "learning_rate": 1.2355460385438973e-05, "loss": 0.0002, "reward": 0.875, "reward_std": 0.5303300842642784, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.4375, "step": 357 }, { "completion_length": 150.0, "epoch": 0.3831950762643832, "grad_norm": 0.41295602917671204, "kl": 0.008965130429714918, "learning_rate": 1.233404710920771e-05, "loss": 0.0004, "reward": 1.15625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 358 }, { "completion_length": 144.6875, "epoch": 0.3842654535723843, "grad_norm": 0.20740610361099243, "kl": 0.006285946466960013, "learning_rate": 1.2312633832976446e-05, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.375, "step": 359 }, { "completion_length": 143.625, "epoch": 0.3853358308803853, "grad_norm": 0.3789742588996887, "kl": 0.008684631437063217, "learning_rate": 1.2291220556745183e-05, "loss": 0.0003, "reward": 1.15625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 360 }, { "completion_length": 134.625, "epoch": 0.3864062081883864, "grad_norm": 0.40562334656715393, "kl": 0.007627160986885428, "learning_rate": 1.2269807280513919e-05, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 361 }, { "completion_length": 128.75, "epoch": 0.38747658549638747, "grad_norm": 0.440599262714386, "kl": 0.013889113673940301, "learning_rate": 1.2248394004282656e-05, "loss": 0.0006, "reward": 1.09375, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.34375, "step": 362 }, { "completion_length": 145.3125, "epoch": 0.38854696280438855, "grad_norm": 0.4862249493598938, "kl": 0.008840502239763737, "learning_rate": 1.2226980728051392e-05, "loss": 0.0004, "reward": 1.0, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 363 }, { "completion_length": 144.375, "epoch": 0.38961734011238963, "grad_norm": 0.20939357578754425, "kl": 0.007574749062769115, "learning_rate": 1.220556745182013e-05, "loss": 0.0003, "reward": 1.34375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.46875, "step": 364 }, { "completion_length": 142.6875, "epoch": 0.3906877174203907, "grad_norm": 0.25527122616767883, "kl": 0.008040097774937749, "learning_rate": 1.2184154175588866e-05, "loss": 0.0003, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 365 }, { "completion_length": 145.125, "epoch": 0.39175809472839174, "grad_norm": 0.4441983103752136, "kl": 0.005440868902951479, "learning_rate": 1.2162740899357604e-05, "loss": 0.0002, "reward": 0.84375, "reward_std": 0.5745242536067963, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.40625, "step": 366 }, { "completion_length": 140.375, "epoch": 0.3928284720363928, "grad_norm": 0.4390992820262909, "kl": 0.009075271780602634, "learning_rate": 1.214132762312634e-05, "loss": 0.0004, "reward": 1.0625, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.5, "step": 367 }, { "completion_length": 145.9375, "epoch": 0.3938988493443939, "grad_norm": 0.19165712594985962, "kl": 0.008053401717916131, "learning_rate": 1.2119914346895077e-05, "loss": 0.0003, "reward": 1.0, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 368 }, { "completion_length": 134.5625, "epoch": 0.394969226652395, "grad_norm": 0.39710742235183716, "kl": 0.007487794617190957, "learning_rate": 1.2098501070663812e-05, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.4419417232275009, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 369 }, { "completion_length": 137.75, "epoch": 0.39603960396039606, "grad_norm": 0.2805812954902649, "kl": 0.005680811242200434, "learning_rate": 1.207708779443255e-05, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 370 }, { "completion_length": 143.4375, "epoch": 0.3971099812683971, "grad_norm": 0.5842120051383972, "kl": 0.010724235326051712, "learning_rate": 1.2055674518201285e-05, "loss": 0.0004, "reward": 0.75, "reward_std": 0.6187184192240238, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.3125, "step": 371 }, { "completion_length": 142.875, "epoch": 0.39818035857639816, "grad_norm": 0.37814757227897644, "kl": 0.007575657917186618, "learning_rate": 1.2034261241970023e-05, "loss": 0.0003, "reward": 1.28125, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 372 }, { "completion_length": 143.4375, "epoch": 0.39925073588439924, "grad_norm": 0.2070724368095398, "kl": 0.0048540683928877115, "learning_rate": 1.2012847965738758e-05, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.3125, "step": 373 }, { "completion_length": 144.5, "epoch": 0.4003211131924003, "grad_norm": 0.3560534417629242, "kl": 0.00709749455563724, "learning_rate": 1.1991434689507496e-05, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.46875, "step": 374 }, { "completion_length": 133.9375, "epoch": 0.4013914905004014, "grad_norm": 0.010349521413445473, "kl": 0.004944240965414792, "learning_rate": 1.1970021413276231e-05, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 375 }, { "completion_length": 131.1875, "epoch": 0.4024618678084025, "grad_norm": 0.5015347599983215, "kl": 0.006928665563464165, "learning_rate": 1.1948608137044969e-05, "loss": 0.0003, "reward": 1.0, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.5, "step": 376 }, { "completion_length": 137.0625, "epoch": 0.4035322451164035, "grad_norm": 0.30285099148750305, "kl": 0.00569568807259202, "learning_rate": 1.1927194860813704e-05, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 377 }, { "completion_length": 149.0, "epoch": 0.4046026224244046, "grad_norm": 0.22503948211669922, "kl": 0.008137812954373658, "learning_rate": 1.1905781584582443e-05, "loss": 0.0003, "reward": 1.125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.375, "step": 378 }, { "completion_length": 142.5, "epoch": 0.40567299973240567, "grad_norm": 0.3060752749443054, "kl": 0.006309179938398302, "learning_rate": 1.1884368308351179e-05, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.39774756133556366, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 379 }, { "completion_length": 150.0, "epoch": 0.40674337704040675, "grad_norm": 0.1765987128019333, "kl": 0.005777367390692234, "learning_rate": 1.1862955032119916e-05, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 380 }, { "completion_length": 144.875, "epoch": 0.40781375434840783, "grad_norm": 0.45171236991882324, "kl": 0.008916746824979782, "learning_rate": 1.1841541755888652e-05, "loss": 0.0004, "reward": 0.8125, "reward_std": 0.4419417344033718, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.375, "step": 381 }, { "completion_length": 144.8125, "epoch": 0.4088841316564089, "grad_norm": 0.17925989627838135, "kl": 0.007553309260401875, "learning_rate": 1.182012847965739e-05, "loss": 0.0003, "reward": 1.0, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 382 }, { "completion_length": 132.9375, "epoch": 0.40995450896440994, "grad_norm": 0.16423432528972626, "kl": 0.005629725172184408, "learning_rate": 1.1798715203426125e-05, "loss": 0.0002, "reward": 1.125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 383 }, { "completion_length": 128.75, "epoch": 0.411024886272411, "grad_norm": 0.40454113483428955, "kl": 0.008162200916558504, "learning_rate": 1.1777301927194862e-05, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.4419417269527912, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 384 }, { "completion_length": 145.875, "epoch": 0.4120952635804121, "grad_norm": 0.252472847700119, "kl": 0.009246850502677262, "learning_rate": 1.1755888650963598e-05, "loss": 0.0004, "reward": 1.21875, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 385 }, { "completion_length": 133.3125, "epoch": 0.4131656408884132, "grad_norm": 0.4522092938423157, "kl": 0.007653492968529463, "learning_rate": 1.1734475374732335e-05, "loss": 0.0003, "reward": 1.125, "reward_std": 0.3535533882677555, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 386 }, { "completion_length": 141.6875, "epoch": 0.41423601819641426, "grad_norm": 0.3999173939228058, "kl": 0.009492598241195083, "learning_rate": 1.1713062098501071e-05, "loss": 0.0004, "reward": 1.03125, "reward_std": 0.48613590747117996, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 387 }, { "completion_length": 139.5, "epoch": 0.4153063955044153, "grad_norm": 0.43083757162094116, "kl": 0.005691648810170591, "learning_rate": 1.1691648822269808e-05, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 388 }, { "completion_length": 142.125, "epoch": 0.41637677281241636, "grad_norm": 0.22840003669261932, "kl": 0.006524853291921318, "learning_rate": 1.1670235546038544e-05, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 389 }, { "completion_length": 144.0, "epoch": 0.41744715012041744, "grad_norm": 0.16357694566249847, "kl": 0.00781522796023637, "learning_rate": 1.1648822269807281e-05, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.40625, "step": 390 }, { "completion_length": 145.25, "epoch": 0.4185175274284185, "grad_norm": 0.32055553793907166, "kl": 0.00812217709608376, "learning_rate": 1.1627408993576017e-05, "loss": 0.0003, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 391 }, { "completion_length": 135.875, "epoch": 0.4195879047364196, "grad_norm": 0.3294346034526825, "kl": 0.004109648056328297, "learning_rate": 1.1605995717344756e-05, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 392 }, { "completion_length": 147.75, "epoch": 0.4206582820444207, "grad_norm": 0.3006425201892853, "kl": 0.007214001612737775, "learning_rate": 1.1584582441113492e-05, "loss": 0.0003, "reward": 0.875, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 393 }, { "completion_length": 139.0, "epoch": 0.4217286593524217, "grad_norm": 0.24566304683685303, "kl": 0.005013052606955171, "learning_rate": 1.1563169164882229e-05, "loss": 0.0002, "reward": 1.0, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 394 }, { "completion_length": 138.0625, "epoch": 0.4227990366604228, "grad_norm": 0.20261645317077637, "kl": 0.007197016500867903, "learning_rate": 1.1541755888650965e-05, "loss": 0.0003, "reward": 1.15625, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 395 }, { "completion_length": 135.8125, "epoch": 0.42386941396842387, "grad_norm": 0.41987553238868713, "kl": 0.010161266080103815, "learning_rate": 1.1520342612419702e-05, "loss": 0.0004, "reward": 0.875, "reward_std": 0.5303300805389881, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.4375, "step": 396 }, { "completion_length": 134.125, "epoch": 0.42493979127642495, "grad_norm": 0.018983617424964905, "kl": 0.008968915150035173, "learning_rate": 1.1498929336188438e-05, "loss": 0.0004, "reward": 1.0, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 397 }, { "completion_length": 149.5, "epoch": 0.42601016858442603, "grad_norm": 0.1306980848312378, "kl": 0.007947773789055645, "learning_rate": 1.1477516059957175e-05, "loss": 0.0003, "reward": 1.40625, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.9375, "rewards/format_reward_func_qa": 0.46875, "step": 398 }, { "completion_length": 130.5625, "epoch": 0.42708054589242705, "grad_norm": 0.27841174602508545, "kl": 0.006165906437672675, "learning_rate": 1.145610278372591e-05, "loss": 0.0002, "reward": 1.03125, "reward_std": 0.22097086906433105, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 399 }, { "completion_length": 148.3125, "epoch": 0.42815092320042814, "grad_norm": 0.33284708857536316, "kl": 0.004465712641831487, "learning_rate": 1.1434689507494648e-05, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.34375, "step": 400 }, { "completion_length": 129.1875, "epoch": 0.4292213005084292, "grad_norm": 0.26054877042770386, "kl": 0.006159590440802276, "learning_rate": 1.1413276231263384e-05, "loss": 0.0002, "reward": 1.125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 401 }, { "completion_length": 137.0625, "epoch": 0.4302916778164303, "grad_norm": 0.22967737913131714, "kl": 0.01152036536950618, "learning_rate": 1.1391862955032121e-05, "loss": 0.0005, "reward": 1.15625, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.40625, "step": 402 }, { "completion_length": 139.375, "epoch": 0.4313620551244314, "grad_norm": 0.4154336154460907, "kl": 0.006160021992400289, "learning_rate": 1.1370449678800857e-05, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 403 }, { "completion_length": 144.375, "epoch": 0.43243243243243246, "grad_norm": 0.3677040636539459, "kl": 0.00974874512758106, "learning_rate": 1.1349036402569594e-05, "loss": 0.0004, "reward": 1.25, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 404 }, { "completion_length": 137.1875, "epoch": 0.4335028097404335, "grad_norm": 0.43192532658576965, "kl": 0.007872526533901691, "learning_rate": 1.132762312633833e-05, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.48613590747117996, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 405 }, { "completion_length": 135.375, "epoch": 0.43457318704843456, "grad_norm": 0.3646664321422577, "kl": 0.011834080913104117, "learning_rate": 1.1306209850107069e-05, "loss": 0.0005, "reward": 1.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 406 }, { "completion_length": 140.8125, "epoch": 0.43564356435643564, "grad_norm": 0.18076443672180176, "kl": 0.006851482554338872, "learning_rate": 1.1284796573875804e-05, "loss": 0.0003, "reward": 1.0, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 407 }, { "completion_length": 147.4375, "epoch": 0.4367139416644367, "grad_norm": 0.22329428791999817, "kl": 0.0071833484107628465, "learning_rate": 1.1263383297644542e-05, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.30935920402407646, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.34375, "step": 408 }, { "completion_length": 144.5, "epoch": 0.4377843189724378, "grad_norm": 0.1401551216840744, "kl": 0.006534727523103356, "learning_rate": 1.1241970021413277e-05, "loss": 0.0003, "reward": 1.28125, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 409 }, { "completion_length": 123.75, "epoch": 0.4388546962804388, "grad_norm": 0.3974740505218506, "kl": 0.00892953563015908, "learning_rate": 1.1220556745182015e-05, "loss": 0.0004, "reward": 1.1875, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.5, "step": 410 }, { "completion_length": 142.625, "epoch": 0.4399250735884399, "grad_norm": 0.44294533133506775, "kl": 0.01865853532217443, "learning_rate": 1.119914346895075e-05, "loss": 0.0007, "reward": 1.125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 411 }, { "completion_length": 143.25, "epoch": 0.440995450896441, "grad_norm": 0.22854338586330414, "kl": 0.010148898232728243, "learning_rate": 1.1177730192719488e-05, "loss": 0.0004, "reward": 0.96875, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 412 }, { "completion_length": 133.6875, "epoch": 0.44206582820444207, "grad_norm": 0.1967206746339798, "kl": 0.00770859164185822, "learning_rate": 1.1156316916488223e-05, "loss": 0.0003, "reward": 1.125, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.5, "step": 413 }, { "completion_length": 145.875, "epoch": 0.44313620551244315, "grad_norm": 0.22401538491249084, "kl": 0.005290259723551571, "learning_rate": 1.113490364025696e-05, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.40625, "step": 414 }, { "completion_length": 141.125, "epoch": 0.44420658282044423, "grad_norm": 0.21706151962280273, "kl": 0.00950657797511667, "learning_rate": 1.1113490364025696e-05, "loss": 0.0004, "reward": 0.6875, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.375, "step": 415 }, { "completion_length": 134.75, "epoch": 0.44527696012844525, "grad_norm": 0.3872338831424713, "kl": 0.008573073195293546, "learning_rate": 1.1092077087794434e-05, "loss": 0.0003, "reward": 0.875, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.4375, "step": 416 }, { "completion_length": 146.5, "epoch": 0.44634733743644633, "grad_norm": 0.2254711389541626, "kl": 0.005528193025384098, "learning_rate": 1.107066381156317e-05, "loss": 0.0002, "reward": 0.84375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.40625, "step": 417 }, { "completion_length": 141.4375, "epoch": 0.4474177147444474, "grad_norm": 0.34880560636520386, "kl": 0.009334266535006464, "learning_rate": 1.1049250535331907e-05, "loss": 0.0004, "reward": 1.28125, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 418 }, { "completion_length": 142.8125, "epoch": 0.4484880920524485, "grad_norm": 0.3631342053413391, "kl": 0.008129345544148237, "learning_rate": 1.1027837259100644e-05, "loss": 0.0003, "reward": 1.15625, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 419 }, { "completion_length": 127.375, "epoch": 0.4495584693604496, "grad_norm": 0.31782153248786926, "kl": 0.006236298417206854, "learning_rate": 1.1006423982869381e-05, "loss": 0.0002, "reward": 1.125, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.5, "step": 420 }, { "completion_length": 136.1875, "epoch": 0.45062884666845066, "grad_norm": 0.27864205837249756, "kl": 0.005193412711378187, "learning_rate": 1.0985010706638117e-05, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.375, "step": 421 }, { "completion_length": 147.8125, "epoch": 0.4516992239764517, "grad_norm": 0.16838960349559784, "kl": 0.00500064785592258, "learning_rate": 1.0963597430406854e-05, "loss": 0.0002, "reward": 0.9375, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 422 }, { "completion_length": 145.6875, "epoch": 0.45276960128445276, "grad_norm": 0.48033925890922546, "kl": 0.007599485106766224, "learning_rate": 1.094218415417559e-05, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 423 }, { "completion_length": 142.8125, "epoch": 0.45383997859245384, "grad_norm": 0.32777833938598633, "kl": 0.004126788582652807, "learning_rate": 1.0920770877944327e-05, "loss": 0.0002, "reward": 0.90625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 424 }, { "completion_length": 138.1875, "epoch": 0.4549103559004549, "grad_norm": 0.383306622505188, "kl": 0.009072435786947608, "learning_rate": 1.0899357601713063e-05, "loss": 0.0004, "reward": 1.125, "reward_std": 0.4419417232275009, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.375, "step": 425 }, { "completion_length": 137.125, "epoch": 0.455980733208456, "grad_norm": 0.5520068407058716, "kl": 0.00922621926292777, "learning_rate": 1.08779443254818e-05, "loss": 0.0004, "reward": 0.78125, "reward_std": 0.39774755761027336, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.34375, "step": 426 }, { "completion_length": 134.5, "epoch": 0.457051110516457, "grad_norm": 0.2857801020145416, "kl": 0.007331811357289553, "learning_rate": 1.0856531049250536e-05, "loss": 0.0003, "reward": 1.375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.5, "step": 427 }, { "completion_length": 137.125, "epoch": 0.4581214878244581, "grad_norm": 0.14342139661312103, "kl": 0.011763304937630892, "learning_rate": 1.0835117773019273e-05, "loss": 0.0005, "reward": 1.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.5, "step": 428 }, { "completion_length": 134.5, "epoch": 0.4591918651324592, "grad_norm": 0.2699294686317444, "kl": 0.007278061471879482, "learning_rate": 1.0813704496788009e-05, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.4375, "step": 429 }, { "completion_length": 139.4375, "epoch": 0.46026224244046027, "grad_norm": 0.3437519669532776, "kl": 0.009155996842309833, "learning_rate": 1.0792291220556746e-05, "loss": 0.0004, "reward": 1.125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 430 }, { "completion_length": 135.0, "epoch": 0.46133261974846135, "grad_norm": 0.35517609119415283, "kl": 0.006980703445151448, "learning_rate": 1.0770877944325482e-05, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 431 }, { "completion_length": 144.6875, "epoch": 0.46240299705646243, "grad_norm": 0.2703591585159302, "kl": 0.007698883884586394, "learning_rate": 1.0749464668094217e-05, "loss": 0.0003, "reward": 1.375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.5, "step": 432 }, { "completion_length": 141.8125, "epoch": 0.46347337436446345, "grad_norm": 0.40456750988960266, "kl": 0.0066214584512636065, "learning_rate": 1.0728051391862957e-05, "loss": 0.0003, "reward": 1.28125, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 433 }, { "completion_length": 128.875, "epoch": 0.46454375167246453, "grad_norm": 0.2466873675584793, "kl": 0.010391404503025115, "learning_rate": 1.0706638115631694e-05, "loss": 0.0004, "reward": 1.25, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 434 }, { "completion_length": 145.5, "epoch": 0.4656141289804656, "grad_norm": 0.26018762588500977, "kl": 0.006977772573009133, "learning_rate": 1.068522483940043e-05, "loss": 0.0003, "reward": 1.25, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 435 }, { "completion_length": 147.25, "epoch": 0.4666845062884667, "grad_norm": 0.37460267543792725, "kl": 0.006530374754220247, "learning_rate": 1.0663811563169167e-05, "loss": 0.0003, "reward": 0.75, "reward_std": 0.3535533770918846, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.3125, "step": 436 }, { "completion_length": 117.125, "epoch": 0.4677548835964678, "grad_norm": 0.2479018121957779, "kl": 0.004997036536224186, "learning_rate": 1.0642398286937903e-05, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 437 }, { "completion_length": 142.625, "epoch": 0.4688252609044688, "grad_norm": 0.40501394867897034, "kl": 0.010519789531826973, "learning_rate": 1.062098501070664e-05, "loss": 0.0004, "reward": 0.9375, "reward_std": 0.3535533770918846, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 438 }, { "completion_length": 126.3125, "epoch": 0.4698956382124699, "grad_norm": 0.3182278573513031, "kl": 0.0063638086430728436, "learning_rate": 1.0599571734475376e-05, "loss": 0.0003, "reward": 1.25, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 439 }, { "completion_length": 139.25, "epoch": 0.47096601552047096, "grad_norm": 0.20441149175167084, "kl": 0.00673819100484252, "learning_rate": 1.0578158458244113e-05, "loss": 0.0003, "reward": 1.09375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 440 }, { "completion_length": 133.875, "epoch": 0.47203639282847204, "grad_norm": 0.413339763879776, "kl": 0.007536868681199849, "learning_rate": 1.0556745182012849e-05, "loss": 0.0003, "reward": 1.375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.5, "step": 441 }, { "completion_length": 141.125, "epoch": 0.4731067701364731, "grad_norm": 0.38638320565223694, "kl": 0.008651418378576636, "learning_rate": 1.0535331905781586e-05, "loss": 0.0003, "reward": 1.125, "reward_std": 0.3535533770918846, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 442 }, { "completion_length": 144.125, "epoch": 0.4741771474444742, "grad_norm": 0.333778977394104, "kl": 0.00958932877983898, "learning_rate": 1.0513918629550322e-05, "loss": 0.0004, "reward": 1.03125, "reward_std": 0.30935920402407646, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 443 }, { "completion_length": 148.6875, "epoch": 0.4752475247524752, "grad_norm": 0.35525214672088623, "kl": 0.008321229775901884, "learning_rate": 1.0492505353319057e-05, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.40625, "step": 444 }, { "completion_length": 148.0, "epoch": 0.4763179020604763, "grad_norm": 0.36049333214759827, "kl": 0.005807650682982057, "learning_rate": 1.0471092077087794e-05, "loss": 0.0002, "reward": 1.03125, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 445 }, { "completion_length": 138.3125, "epoch": 0.4773882793684774, "grad_norm": 0.2663690149784088, "kl": 0.008832648396492004, "learning_rate": 1.044967880085653e-05, "loss": 0.0004, "reward": 1.28125, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 446 }, { "completion_length": 145.125, "epoch": 0.47845865667647847, "grad_norm": 0.4534795880317688, "kl": 0.011585065280087292, "learning_rate": 1.042826552462527e-05, "loss": 0.0005, "reward": 0.71875, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.34375, "step": 447 }, { "completion_length": 140.0, "epoch": 0.47952903398447955, "grad_norm": 0.3728805482387543, "kl": 0.013883272767998278, "learning_rate": 1.0406852248394007e-05, "loss": 0.0006, "reward": 0.875, "reward_std": 0.5303300768136978, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.4375, "step": 448 }, { "completion_length": 136.4375, "epoch": 0.48059941129248057, "grad_norm": 0.35197046399116516, "kl": 0.007696510059759021, "learning_rate": 1.0385438972162742e-05, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.28125, "step": 449 }, { "completion_length": 150.0, "epoch": 0.48166978860048165, "grad_norm": 0.25776129961013794, "kl": 0.007930970285087824, "learning_rate": 1.036402569593148e-05, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 450 }, { "completion_length": 134.8125, "epoch": 0.48274016590848273, "grad_norm": 0.23979267477989197, "kl": 0.009117277164477855, "learning_rate": 1.0342612419700215e-05, "loss": 0.0004, "reward": 1.09375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 451 }, { "completion_length": 149.375, "epoch": 0.4838105432164838, "grad_norm": 0.3924581706523895, "kl": 0.01353934919461608, "learning_rate": 1.0321199143468953e-05, "loss": 0.0005, "reward": 1.125, "reward_std": 0.3535533882677555, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.375, "step": 452 }, { "completion_length": 150.0, "epoch": 0.4848809205244849, "grad_norm": 0.23874813318252563, "kl": 0.010358783532865345, "learning_rate": 1.0299785867237688e-05, "loss": 0.0004, "reward": 1.125, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.5, "step": 453 }, { "completion_length": 145.6875, "epoch": 0.485951297832486, "grad_norm": 0.2809109091758728, "kl": 0.0060470933094620705, "learning_rate": 1.0278372591006426e-05, "loss": 0.0002, "reward": 0.84375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.40625, "step": 454 }, { "completion_length": 137.5625, "epoch": 0.487021675140487, "grad_norm": 0.14426939189434052, "kl": 0.005893114197533578, "learning_rate": 1.0256959314775161e-05, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 455 }, { "completion_length": 136.875, "epoch": 0.4880920524484881, "grad_norm": 0.44749608635902405, "kl": 0.012064934242516756, "learning_rate": 1.0235546038543897e-05, "loss": 0.0005, "reward": 0.90625, "reward_std": 0.30935920402407646, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 456 }, { "completion_length": 136.375, "epoch": 0.48916242975648916, "grad_norm": 0.3581511974334717, "kl": 0.008633325574919581, "learning_rate": 1.0214132762312634e-05, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 457 }, { "completion_length": 137.8125, "epoch": 0.49023280706449024, "grad_norm": 0.23743630945682526, "kl": 0.010246596415527165, "learning_rate": 1.019271948608137e-05, "loss": 0.0004, "reward": 1.28125, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 458 }, { "completion_length": 138.0625, "epoch": 0.4913031843724913, "grad_norm": 0.35530969500541687, "kl": 0.007755730650387704, "learning_rate": 1.0171306209850107e-05, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.4375, "step": 459 }, { "completion_length": 142.375, "epoch": 0.4923735616804924, "grad_norm": 0.28923481702804565, "kl": 0.008694320800714195, "learning_rate": 1.0149892933618843e-05, "loss": 0.0003, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 460 }, { "completion_length": 141.75, "epoch": 0.4934439389884934, "grad_norm": 0.3511667251586914, "kl": 0.010940466541796923, "learning_rate": 1.0128479657387582e-05, "loss": 0.0004, "reward": 1.25, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 461 }, { "completion_length": 149.6875, "epoch": 0.4945143162964945, "grad_norm": 0.2515261471271515, "kl": 0.01002546539530158, "learning_rate": 1.010706638115632e-05, "loss": 0.0004, "reward": 1.25, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 462 }, { "completion_length": 141.25, "epoch": 0.4955846936044956, "grad_norm": 0.34333762526512146, "kl": 0.00718439748743549, "learning_rate": 1.0085653104925055e-05, "loss": 0.0003, "reward": 1.15625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 463 }, { "completion_length": 133.6875, "epoch": 0.49665507091249667, "grad_norm": 0.2072678953409195, "kl": 0.00659529457334429, "learning_rate": 1.0064239828693792e-05, "loss": 0.0003, "reward": 1.25, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 464 }, { "completion_length": 136.3125, "epoch": 0.49772544822049775, "grad_norm": 0.43942439556121826, "kl": 0.00934528629295528, "learning_rate": 1.0042826552462528e-05, "loss": 0.0004, "reward": 0.96875, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 465 }, { "completion_length": 149.0, "epoch": 0.49879582552849877, "grad_norm": 0.23048676550388336, "kl": 0.005269995279377326, "learning_rate": 1.0021413276231265e-05, "loss": 0.0002, "reward": 0.90625, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 466 }, { "completion_length": 132.0625, "epoch": 0.49986620283649985, "grad_norm": 0.4228231906890869, "kl": 0.015294217213522643, "learning_rate": 1e-05, "loss": 0.0006, "reward": 1.0625, "reward_std": 0.4419417232275009, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 467 }, { "completion_length": 140.25, "epoch": 0.5009365801445009, "grad_norm": 0.259385883808136, "kl": 0.005996028776280582, "learning_rate": 9.978586723768736e-06, "loss": 0.0002, "reward": 0.875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 468 }, { "completion_length": 147.125, "epoch": 0.502006957452502, "grad_norm": 0.30805516242980957, "kl": 0.011259513208642602, "learning_rate": 9.957173447537474e-06, "loss": 0.0005, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 469 }, { "completion_length": 147.0, "epoch": 0.5030773347605031, "grad_norm": 0.3703915774822235, "kl": 0.010803711600601673, "learning_rate": 9.93576017130621e-06, "loss": 0.0004, "reward": 1.28125, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 470 }, { "completion_length": 146.0625, "epoch": 0.5041477120685042, "grad_norm": 0.33139583468437195, "kl": 0.012623719405382872, "learning_rate": 9.914346895074949e-06, "loss": 0.0005, "reward": 1.1875, "reward_std": 0.2651650421321392, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 471 }, { "completion_length": 122.75, "epoch": 0.5052180893765053, "grad_norm": 0.2508881390094757, "kl": 0.008042589761316776, "learning_rate": 9.892933618843684e-06, "loss": 0.0003, "reward": 1.4375, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.9375, "rewards/format_reward_func_qa": 0.5, "step": 472 }, { "completion_length": 135.3125, "epoch": 0.5062884666845063, "grad_norm": 0.2330748587846756, "kl": 0.00703393726143986, "learning_rate": 9.871520342612421e-06, "loss": 0.0003, "reward": 1.0, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 473 }, { "completion_length": 117.375, "epoch": 0.5073588439925074, "grad_norm": 0.4916565716266632, "kl": 0.01852000062353909, "learning_rate": 9.850107066381157e-06, "loss": 0.0007, "reward": 0.96875, "reward_std": 0.5745242536067963, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 474 }, { "completion_length": 139.875, "epoch": 0.5084292213005084, "grad_norm": 0.43320146203041077, "kl": 0.00934557057917118, "learning_rate": 9.828693790149893e-06, "loss": 0.0004, "reward": 0.78125, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.40625, "step": 475 }, { "completion_length": 150.0, "epoch": 0.5094995986085095, "grad_norm": 0.33068451285362244, "kl": 0.00842006946913898, "learning_rate": 9.80728051391863e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.4375, "step": 476 }, { "completion_length": 120.5, "epoch": 0.5105699759165105, "grad_norm": 0.2756935656070709, "kl": 0.009832501178607345, "learning_rate": 9.785867237687366e-06, "loss": 0.0004, "reward": 1.125, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.5, "step": 477 }, { "completion_length": 150.0, "epoch": 0.5116403532245116, "grad_norm": 0.3408316671848297, "kl": 0.01131543260999024, "learning_rate": 9.764453961456105e-06, "loss": 0.0005, "reward": 0.90625, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 478 }, { "completion_length": 144.8125, "epoch": 0.5127107305325127, "grad_norm": 0.301351398229599, "kl": 0.007481154287233949, "learning_rate": 9.74304068522484e-06, "loss": 0.0003, "reward": 1.0, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 479 }, { "completion_length": 134.125, "epoch": 0.5137811078405138, "grad_norm": 0.26906123757362366, "kl": 0.010642150649800897, "learning_rate": 9.721627408993576e-06, "loss": 0.0004, "reward": 1.15625, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 480 }, { "completion_length": 149.1875, "epoch": 0.5148514851485149, "grad_norm": 0.3382323682308197, "kl": 0.008608122589066625, "learning_rate": 9.700214132762313e-06, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 481 }, { "completion_length": 125.6875, "epoch": 0.515921862456516, "grad_norm": 0.3092842996120453, "kl": 0.007361217867583036, "learning_rate": 9.678800856531049e-06, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.40625, "step": 482 }, { "completion_length": 136.5625, "epoch": 0.516992239764517, "grad_norm": 0.01267006155103445, "kl": 0.009871136397123337, "learning_rate": 9.657387580299786e-06, "loss": 0.0004, "reward": 1.375, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.5, "step": 483 }, { "completion_length": 150.0, "epoch": 0.5180626170725181, "grad_norm": 0.3303705155849457, "kl": 0.007267375709488988, "learning_rate": 9.635974304068522e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.48613589257001877, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.28125, "step": 484 }, { "completion_length": 136.5625, "epoch": 0.5191329943805192, "grad_norm": 0.011883686296641827, "kl": 0.007186895818449557, "learning_rate": 9.614561027837261e-06, "loss": 0.0003, "reward": 1.5, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 1.0, "rewards/format_reward_func_qa": 0.5, "step": 485 }, { "completion_length": 132.375, "epoch": 0.5202033716885202, "grad_norm": 0.3317205309867859, "kl": 0.011423196760006249, "learning_rate": 9.593147751605997e-06, "loss": 0.0005, "reward": 0.90625, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.46875, "step": 486 }, { "completion_length": 147.5, "epoch": 0.5212737489965212, "grad_norm": 0.35617393255233765, "kl": 0.007501334534026682, "learning_rate": 9.571734475374732e-06, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.2651650309562683, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 487 }, { "completion_length": 141.0, "epoch": 0.5223441263045223, "grad_norm": 0.27668678760528564, "kl": 0.008534971857443452, "learning_rate": 9.55032119914347e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.3535533770918846, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.375, "step": 488 }, { "completion_length": 150.0, "epoch": 0.5234145036125234, "grad_norm": 0.5206948518753052, "kl": 0.013087153085507452, "learning_rate": 9.528907922912205e-06, "loss": 0.0005, "reward": 0.625, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.25, "rewards/format_reward_func_qa": 0.375, "step": 489 }, { "completion_length": 141.75, "epoch": 0.5244848809205245, "grad_norm": 0.007866663858294487, "kl": 0.004131860565394163, "learning_rate": 9.507494646680943e-06, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 490 }, { "completion_length": 143.8125, "epoch": 0.5255552582285256, "grad_norm": 0.32336246967315674, "kl": 0.008200255630072206, "learning_rate": 9.486081370449678e-06, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.375, "step": 491 }, { "completion_length": 143.125, "epoch": 0.5266256355365266, "grad_norm": 0.38344046473503113, "kl": 0.009465484065003693, "learning_rate": 9.464668094218416e-06, "loss": 0.0004, "reward": 1.0625, "reward_std": 0.4419417344033718, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 492 }, { "completion_length": 136.875, "epoch": 0.5276960128445277, "grad_norm": 0.4057978093624115, "kl": 0.007756530423648655, "learning_rate": 9.443254817987153e-06, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 493 }, { "completion_length": 150.0, "epoch": 0.5287663901525288, "grad_norm": 0.5384769439697266, "kl": 0.008908328833058476, "learning_rate": 9.421841541755889e-06, "loss": 0.0004, "reward": 0.9375, "reward_std": 0.5303300693631172, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.4375, "step": 494 }, { "completion_length": 140.75, "epoch": 0.5298367674605299, "grad_norm": 0.5602743029594421, "kl": 0.008926091133616865, "learning_rate": 9.400428265524626e-06, "loss": 0.0004, "reward": 1.09375, "reward_std": 0.5745242536067963, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 495 }, { "completion_length": 123.5625, "epoch": 0.530907144768531, "grad_norm": 0.3694626986980438, "kl": 0.01632965775206685, "learning_rate": 9.379014989293362e-06, "loss": 0.0007, "reward": 1.09375, "reward_std": 0.30935920402407646, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 496 }, { "completion_length": 137.0625, "epoch": 0.5319775220765319, "grad_norm": 0.43530595302581787, "kl": 0.007556148339062929, "learning_rate": 9.357601713062099e-06, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.48613590002059937, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.46875, "step": 497 }, { "completion_length": 140.875, "epoch": 0.533047899384533, "grad_norm": 0.361478716135025, "kl": 0.007779664359986782, "learning_rate": 9.336188436830836e-06, "loss": 0.0003, "reward": 1.09375, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 498 }, { "completion_length": 143.8125, "epoch": 0.5341182766925341, "grad_norm": 0.22836001217365265, "kl": 0.013356427545659244, "learning_rate": 9.314775160599572e-06, "loss": 0.0005, "reward": 1.28125, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.40625, "step": 499 }, { "completion_length": 139.3125, "epoch": 0.5351886540005352, "grad_norm": 0.2930338978767395, "kl": 0.00791727239266038, "learning_rate": 9.29336188436831e-06, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 500 }, { "completion_length": 139.25, "epoch": 0.5362590313085362, "grad_norm": 0.14838145673274994, "kl": 0.009750527096912265, "learning_rate": 9.271948608137045e-06, "loss": 0.0004, "reward": 1.3125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.5, "step": 501 }, { "completion_length": 132.75, "epoch": 0.5373294086165373, "grad_norm": 0.3404102325439453, "kl": 0.009338638396002352, "learning_rate": 9.250535331905782e-06, "loss": 0.0004, "reward": 1.03125, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.46875, "step": 502 }, { "completion_length": 124.9375, "epoch": 0.5383997859245384, "grad_norm": 0.21074822545051575, "kl": 0.011723292875103652, "learning_rate": 9.229122055674518e-06, "loss": 0.0005, "reward": 1.28125, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 503 }, { "completion_length": 145.6875, "epoch": 0.5394701632325395, "grad_norm": 0.30406832695007324, "kl": 0.008067434129770845, "learning_rate": 9.207708779443255e-06, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.46875, "step": 504 }, { "completion_length": 125.5, "epoch": 0.5405405405405406, "grad_norm": 0.46522197127342224, "kl": 0.009590548230335116, "learning_rate": 9.186295503211993e-06, "loss": 0.0004, "reward": 1.28125, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 505 }, { "completion_length": 136.375, "epoch": 0.5416109178485417, "grad_norm": 0.22833353281021118, "kl": 0.008472877787426114, "learning_rate": 9.164882226980728e-06, "loss": 0.0003, "reward": 1.125, "reward_std": 0.2651650309562683, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.375, "step": 506 }, { "completion_length": 138.4375, "epoch": 0.5426812951565427, "grad_norm": 0.16364328563213348, "kl": 0.00716752465814352, "learning_rate": 9.143468950749466e-06, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.5, "step": 507 }, { "completion_length": 133.0, "epoch": 0.5437516724645437, "grad_norm": 0.42439737915992737, "kl": 0.01228869054466486, "learning_rate": 9.122055674518201e-06, "loss": 0.0005, "reward": 1.15625, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 508 }, { "completion_length": 141.75, "epoch": 0.5448220497725448, "grad_norm": 0.1622205674648285, "kl": 0.014454575954005122, "learning_rate": 9.100642398286939e-06, "loss": 0.0006, "reward": 0.96875, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.34375, "step": 509 }, { "completion_length": 137.1875, "epoch": 0.5458924270805459, "grad_norm": 0.2893645167350769, "kl": 0.01325001590885222, "learning_rate": 9.079229122055674e-06, "loss": 0.0005, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 510 }, { "completion_length": 131.25, "epoch": 0.5469628043885469, "grad_norm": 0.31013864278793335, "kl": 0.014093953999690711, "learning_rate": 9.057815845824412e-06, "loss": 0.0006, "reward": 0.90625, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 511 }, { "completion_length": 131.6875, "epoch": 0.548033181696548, "grad_norm": 0.012186127714812756, "kl": 0.008404308697208762, "learning_rate": 9.036402569593149e-06, "loss": 0.0003, "reward": 1.375, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.5, "step": 512 }, { "completion_length": 134.375, "epoch": 0.5491035590045491, "grad_norm": 0.4123871326446533, "kl": 0.007007199979852885, "learning_rate": 9.014989293361885e-06, "loss": 0.0003, "reward": 1.25, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 513 }, { "completion_length": 117.1875, "epoch": 0.5501739363125502, "grad_norm": 0.30458545684814453, "kl": 0.015905787469819188, "learning_rate": 8.993576017130622e-06, "loss": 0.0006, "reward": 1.15625, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 514 }, { "completion_length": 141.25, "epoch": 0.5512443136205513, "grad_norm": 0.46298947930336, "kl": 0.011632127338089049, "learning_rate": 8.972162740899358e-06, "loss": 0.0005, "reward": 1.21875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 515 }, { "completion_length": 141.875, "epoch": 0.5523146909285523, "grad_norm": 0.3965262174606323, "kl": 0.009626075625419617, "learning_rate": 8.950749464668095e-06, "loss": 0.0004, "reward": 0.8125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.375, "step": 516 }, { "completion_length": 133.4375, "epoch": 0.5533850682365534, "grad_norm": 0.28009626269340515, "kl": 0.01591487549012527, "learning_rate": 8.92933618843683e-06, "loss": 0.0006, "reward": 1.34375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.46875, "step": 517 }, { "completion_length": 149.125, "epoch": 0.5544554455445545, "grad_norm": 0.44649243354797363, "kl": 0.009925358463078737, "learning_rate": 8.907922912205568e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 518 }, { "completion_length": 146.25, "epoch": 0.5555258228525555, "grad_norm": 0.18165594339370728, "kl": 0.00819373200647533, "learning_rate": 8.886509635974305e-06, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 519 }, { "completion_length": 147.375, "epoch": 0.5565962001605566, "grad_norm": 0.4177275002002716, "kl": 0.008600716711953282, "learning_rate": 8.865096359743041e-06, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.48613590002059937, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.46875, "step": 520 }, { "completion_length": 135.375, "epoch": 0.5576665774685576, "grad_norm": 0.21288467943668365, "kl": 0.00567637593485415, "learning_rate": 8.843683083511778e-06, "loss": 0.0002, "reward": 0.84375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 521 }, { "completion_length": 133.3125, "epoch": 0.5587369547765587, "grad_norm": 0.35557377338409424, "kl": 0.007055180089082569, "learning_rate": 8.822269807280514e-06, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.5, "step": 522 }, { "completion_length": 131.25, "epoch": 0.5598073320845598, "grad_norm": 0.3630206882953644, "kl": 0.006192219443619251, "learning_rate": 8.800856531049251e-06, "loss": 0.0002, "reward": 0.9375, "reward_std": 0.3535533882677555, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 523 }, { "completion_length": 120.1875, "epoch": 0.5608777093925609, "grad_norm": 0.3251684010028839, "kl": 0.009179081069305539, "learning_rate": 8.779443254817987e-06, "loss": 0.0004, "reward": 1.25, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 524 }, { "completion_length": 138.3125, "epoch": 0.561948086700562, "grad_norm": 0.37668687105178833, "kl": 0.00513340113684535, "learning_rate": 8.758029978586724e-06, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.4419417344033718, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 525 }, { "completion_length": 145.0625, "epoch": 0.563018464008563, "grad_norm": 0.315995991230011, "kl": 0.008313555736094713, "learning_rate": 8.736616702355462e-06, "loss": 0.0003, "reward": 0.875, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 526 }, { "completion_length": 135.1875, "epoch": 0.5640888413165641, "grad_norm": 0.18722151219844818, "kl": 0.00760848238132894, "learning_rate": 8.715203426124197e-06, "loss": 0.0003, "reward": 1.3125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.5, "step": 527 }, { "completion_length": 140.1875, "epoch": 0.5651592186245652, "grad_norm": 0.4217666983604431, "kl": 0.007592201582156122, "learning_rate": 8.693790149892935e-06, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.48613590747117996, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.46875, "step": 528 }, { "completion_length": 136.0, "epoch": 0.5662295959325663, "grad_norm": 0.23941150307655334, "kl": 0.006432738737203181, "learning_rate": 8.67237687366167e-06, "loss": 0.0003, "reward": 1.15625, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 529 }, { "completion_length": 124.0625, "epoch": 0.5672999732405672, "grad_norm": 0.3700920045375824, "kl": 0.012712378636933863, "learning_rate": 8.650963597430408e-06, "loss": 0.0005, "reward": 1.3125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.5, "step": 530 }, { "completion_length": 139.3125, "epoch": 0.5683703505485683, "grad_norm": 0.012443755753338337, "kl": 0.007858548837248236, "learning_rate": 8.629550321199143e-06, "loss": 0.0003, "reward": 1.125, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.375, "step": 531 }, { "completion_length": 143.4375, "epoch": 0.5694407278565694, "grad_norm": 0.3603968918323517, "kl": 0.010017989436164498, "learning_rate": 8.60813704496788e-06, "loss": 0.0004, "reward": 1.1875, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 532 }, { "completion_length": 134.9375, "epoch": 0.5705111051645705, "grad_norm": 0.3105698227882385, "kl": 0.008955774828791618, "learning_rate": 8.586723768736618e-06, "loss": 0.0004, "reward": 1.09375, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 533 }, { "completion_length": 135.25, "epoch": 0.5715814824725716, "grad_norm": 0.49610576033592224, "kl": 0.010118793696165085, "learning_rate": 8.565310492505354e-06, "loss": 0.0004, "reward": 1.09375, "reward_std": 0.5745242349803448, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.34375, "step": 534 }, { "completion_length": 149.75, "epoch": 0.5726518597805726, "grad_norm": 0.32460343837738037, "kl": 0.00822330906521529, "learning_rate": 8.543897216274091e-06, "loss": 0.0003, "reward": 1.125, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 535 }, { "completion_length": 128.875, "epoch": 0.5737222370885737, "grad_norm": 0.39197736978530884, "kl": 0.007472209166735411, "learning_rate": 8.522483940042827e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.4419417195022106, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.3125, "step": 536 }, { "completion_length": 140.125, "epoch": 0.5747926143965748, "grad_norm": 0.32253098487854004, "kl": 0.008819866925477982, "learning_rate": 8.501070663811564e-06, "loss": 0.0004, "reward": 1.28125, "reward_std": 0.30935921519994736, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 537 }, { "completion_length": 143.6875, "epoch": 0.5758629917045759, "grad_norm": 0.2873716354370117, "kl": 0.007969585945829749, "learning_rate": 8.4796573875803e-06, "loss": 0.0003, "reward": 1.15625, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 538 }, { "completion_length": 146.125, "epoch": 0.576933369012577, "grad_norm": 0.40148022770881653, "kl": 0.009044699021615088, "learning_rate": 8.458244111349037e-06, "loss": 0.0004, "reward": 1.0, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 539 }, { "completion_length": 141.5625, "epoch": 0.578003746320578, "grad_norm": 0.4167746603488922, "kl": 0.008393046853598207, "learning_rate": 8.436830835117774e-06, "loss": 0.0003, "reward": 1.09375, "reward_std": 0.39774755761027336, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 540 }, { "completion_length": 144.1875, "epoch": 0.5790741236285791, "grad_norm": 0.4090927243232727, "kl": 0.008535318891517818, "learning_rate": 8.41541755888651e-06, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 541 }, { "completion_length": 129.25, "epoch": 0.5801445009365801, "grad_norm": 0.2550817131996155, "kl": 0.008329204516485333, "learning_rate": 8.394004282655247e-06, "loss": 0.0003, "reward": 1.28125, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 542 }, { "completion_length": 123.0625, "epoch": 0.5812148782445812, "grad_norm": 0.39355388283729553, "kl": 0.01337734708795324, "learning_rate": 8.372591006423983e-06, "loss": 0.0005, "reward": 0.84375, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 543 }, { "completion_length": 129.3125, "epoch": 0.5822852555525823, "grad_norm": 0.016425615176558495, "kl": 0.007993116625584662, "learning_rate": 8.35117773019272e-06, "loss": 0.0003, "reward": 1.25, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 544 }, { "completion_length": 126.25, "epoch": 0.5833556328605833, "grad_norm": 0.33168816566467285, "kl": 0.01170782302506268, "learning_rate": 8.329764453961456e-06, "loss": 0.0005, "reward": 0.96875, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.34375, "step": 545 }, { "completion_length": 124.875, "epoch": 0.5844260101685844, "grad_norm": 0.3020273745059967, "kl": 0.008699746569618583, "learning_rate": 8.308351177730193e-06, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 546 }, { "completion_length": 135.5625, "epoch": 0.5854963874765855, "grad_norm": 0.3004071116447449, "kl": 0.008066077600233257, "learning_rate": 8.28693790149893e-06, "loss": 0.0003, "reward": 1.125, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 547 }, { "completion_length": 138.5625, "epoch": 0.5865667647845866, "grad_norm": 0.560327410697937, "kl": 0.017379946541041136, "learning_rate": 8.265524625267666e-06, "loss": 0.0007, "reward": 0.96875, "reward_std": 0.5745242461562157, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.46875, "step": 548 }, { "completion_length": 136.875, "epoch": 0.5876371420925877, "grad_norm": 0.09763245284557343, "kl": 0.005846530548296869, "learning_rate": 8.244111349036404e-06, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 549 }, { "completion_length": 137.75, "epoch": 0.5887075194005887, "grad_norm": 0.34293675422668457, "kl": 0.012747644213959575, "learning_rate": 8.22269807280514e-06, "loss": 0.0005, "reward": 0.71875, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.40625, "step": 550 }, { "completion_length": 138.5625, "epoch": 0.5897778967085898, "grad_norm": 0.3456750214099884, "kl": 0.0048185313353315, "learning_rate": 8.201284796573877e-06, "loss": 0.0002, "reward": 1.125, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 551 }, { "completion_length": 139.3125, "epoch": 0.5908482740165909, "grad_norm": 0.2401307374238968, "kl": 0.00872652349062264, "learning_rate": 8.179871520342612e-06, "loss": 0.0003, "reward": 1.25, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 552 }, { "completion_length": 124.9375, "epoch": 0.5919186513245919, "grad_norm": 0.11673422902822495, "kl": 0.00866595667321235, "learning_rate": 8.15845824411135e-06, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 553 }, { "completion_length": 133.8125, "epoch": 0.592989028632593, "grad_norm": 0.3099370002746582, "kl": 0.010131070390343666, "learning_rate": 8.137044967880087e-06, "loss": 0.0004, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.375, "step": 554 }, { "completion_length": 135.5, "epoch": 0.594059405940594, "grad_norm": 0.21149253845214844, "kl": 0.006644633715040982, "learning_rate": 8.115631691648823e-06, "loss": 0.0003, "reward": 1.3125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.5, "step": 555 }, { "completion_length": 129.125, "epoch": 0.5951297832485951, "grad_norm": 0.26670601963996887, "kl": 0.007839431404136121, "learning_rate": 8.09421841541756e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.375, "step": 556 }, { "completion_length": 140.0625, "epoch": 0.5962001605565962, "grad_norm": 0.265605092048645, "kl": 0.007405390962958336, "learning_rate": 8.072805139186296e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.375, "step": 557 }, { "completion_length": 145.4375, "epoch": 0.5972705378645973, "grad_norm": 0.38561519980430603, "kl": 0.012413175776600838, "learning_rate": 8.051391862955033e-06, "loss": 0.0005, "reward": 1.0625, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.375, "step": 558 }, { "completion_length": 130.125, "epoch": 0.5983409151725984, "grad_norm": 0.27508124709129333, "kl": 0.007573126349598169, "learning_rate": 8.029978586723769e-06, "loss": 0.0003, "reward": 1.375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.5, "step": 559 }, { "completion_length": 150.0, "epoch": 0.5994112924805994, "grad_norm": 0.33988308906555176, "kl": 0.0077141954097896814, "learning_rate": 8.008565310492506e-06, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.30935920402407646, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 560 }, { "completion_length": 127.25, "epoch": 0.6004816697886005, "grad_norm": 0.3382355272769928, "kl": 0.008031048870179802, "learning_rate": 7.987152034261243e-06, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 561 }, { "completion_length": 115.3125, "epoch": 0.6015520470966016, "grad_norm": 0.37109506130218506, "kl": 0.015923229278996587, "learning_rate": 7.965738758029979e-06, "loss": 0.0006, "reward": 1.0, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 562 }, { "completion_length": 134.375, "epoch": 0.6026224244046027, "grad_norm": 0.6142891049385071, "kl": 0.00813570327591151, "learning_rate": 7.944325481798716e-06, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.48613590002059937, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 563 }, { "completion_length": 134.625, "epoch": 0.6036928017126036, "grad_norm": 0.2082885205745697, "kl": 0.007123142713680863, "learning_rate": 7.922912205567452e-06, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 564 }, { "completion_length": 136.375, "epoch": 0.6047631790206047, "grad_norm": 0.46369668841362, "kl": 0.01389736874261871, "learning_rate": 7.90149892933619e-06, "loss": 0.0006, "reward": 0.90625, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 565 }, { "completion_length": 139.1875, "epoch": 0.6058335563286058, "grad_norm": 0.3680477440357208, "kl": 0.017916168551892042, "learning_rate": 7.880085653104925e-06, "loss": 0.0007, "reward": 1.125, "reward_std": 0.3535533770918846, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 566 }, { "completion_length": 132.0, "epoch": 0.6069039336366069, "grad_norm": 0.19841904938220978, "kl": 0.006071976851671934, "learning_rate": 7.858672376873662e-06, "loss": 0.0002, "reward": 0.9375, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 567 }, { "completion_length": 134.0, "epoch": 0.607974310944608, "grad_norm": 0.3956359326839447, "kl": 0.006953381933271885, "learning_rate": 7.8372591006424e-06, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 568 }, { "completion_length": 143.6875, "epoch": 0.609044688252609, "grad_norm": 0.2433217614889145, "kl": 0.00763099838513881, "learning_rate": 7.815845824411135e-06, "loss": 0.0003, "reward": 1.375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.5, "step": 569 }, { "completion_length": 127.625, "epoch": 0.6101150655606101, "grad_norm": 0.25940757989883423, "kl": 0.010402272455394268, "learning_rate": 7.794432548179873e-06, "loss": 0.0004, "reward": 1.21875, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.40625, "step": 570 }, { "completion_length": 141.5625, "epoch": 0.6111854428686112, "grad_norm": 0.19346284866333008, "kl": 0.007218311016913503, "learning_rate": 7.773019271948608e-06, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.2651650309562683, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.4375, "step": 571 }, { "completion_length": 144.625, "epoch": 0.6122558201766123, "grad_norm": 0.009141452610492706, "kl": 0.007930459338240325, "learning_rate": 7.751605995717346e-06, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 572 }, { "completion_length": 126.5625, "epoch": 0.6133261974846134, "grad_norm": 0.16032709181308746, "kl": 0.006476277427282184, "learning_rate": 7.730192719486081e-06, "loss": 0.0003, "reward": 1.125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 573 }, { "completion_length": 144.5625, "epoch": 0.6143965747926144, "grad_norm": 0.3314836919307709, "kl": 0.007180300424806774, "learning_rate": 7.708779443254819e-06, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 574 }, { "completion_length": 144.25, "epoch": 0.6154669521006154, "grad_norm": 0.40380364656448364, "kl": 0.008394909033086151, "learning_rate": 7.687366167023556e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.34375, "step": 575 }, { "completion_length": 135.6875, "epoch": 0.6165373294086165, "grad_norm": 0.014081945642828941, "kl": 0.008023535134270787, "learning_rate": 7.665952890792292e-06, "loss": 0.0003, "reward": 1.25, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 576 }, { "completion_length": 135.0, "epoch": 0.6176077067166176, "grad_norm": 0.32331156730651855, "kl": 0.008745613391511142, "learning_rate": 7.644539614561029e-06, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.375, "step": 577 }, { "completion_length": 124.375, "epoch": 0.6186780840246187, "grad_norm": 0.17762035131454468, "kl": 0.011062865611165762, "learning_rate": 7.623126338329765e-06, "loss": 0.0004, "reward": 1.125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 578 }, { "completion_length": 131.125, "epoch": 0.6197484613326197, "grad_norm": 0.3630572557449341, "kl": 0.03172519465442747, "learning_rate": 7.601713062098501e-06, "loss": 0.0013, "reward": 1.34375, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.46875, "step": 579 }, { "completion_length": 129.8125, "epoch": 0.6208188386406208, "grad_norm": 0.13417424261569977, "kl": 0.007421527290716767, "learning_rate": 7.580299785867238e-06, "loss": 0.0003, "reward": 1.28125, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 580 }, { "completion_length": 128.75, "epoch": 0.6218892159486219, "grad_norm": 0.024504827335476875, "kl": 0.013772662030532956, "learning_rate": 7.558886509635975e-06, "loss": 0.0006, "reward": 1.3125, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.4375, "step": 581 }, { "completion_length": 143.5625, "epoch": 0.622959593256623, "grad_norm": 0.15142691135406494, "kl": 0.00681332778185606, "learning_rate": 7.5374732334047115e-06, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.375, "step": 582 }, { "completion_length": 119.6875, "epoch": 0.6240299705646241, "grad_norm": 0.2795710563659668, "kl": 0.007305014180019498, "learning_rate": 7.516059957173448e-06, "loss": 0.0003, "reward": 1.125, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.5, "step": 583 }, { "completion_length": 135.5, "epoch": 0.6251003478726251, "grad_norm": 0.3001692593097687, "kl": 0.007323737954720855, "learning_rate": 7.4946466809421845e-06, "loss": 0.0003, "reward": 1.125, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.5, "step": 584 }, { "completion_length": 117.1875, "epoch": 0.6261707251806262, "grad_norm": 0.20667538046836853, "kl": 0.01834630419034511, "learning_rate": 7.473233404710921e-06, "loss": 0.0007, "reward": 1.34375, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.46875, "step": 585 }, { "completion_length": 121.75, "epoch": 0.6272411024886272, "grad_norm": 0.28877851366996765, "kl": 0.012314608437009156, "learning_rate": 7.4518201284796575e-06, "loss": 0.0005, "reward": 0.90625, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 586 }, { "completion_length": 136.0625, "epoch": 0.6283114797966283, "grad_norm": 0.3191048800945282, "kl": 0.022442583227530122, "learning_rate": 7.430406852248394e-06, "loss": 0.0009, "reward": 1.25, "reward_std": 0.2651650309562683, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.375, "step": 587 }, { "completion_length": 122.25, "epoch": 0.6293818571046293, "grad_norm": 0.3955375850200653, "kl": 0.011896957294084132, "learning_rate": 7.408993576017131e-06, "loss": 0.0005, "reward": 1.125, "reward_std": 0.5303300768136978, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 588 }, { "completion_length": 137.75, "epoch": 0.6304522344126304, "grad_norm": 0.3016032576560974, "kl": 0.008298300846945494, "learning_rate": 7.387580299785868e-06, "loss": 0.0003, "reward": 1.125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 589 }, { "completion_length": 134.875, "epoch": 0.6315226117206315, "grad_norm": 0.32232144474983215, "kl": 0.007292462571058422, "learning_rate": 7.366167023554604e-06, "loss": 0.0003, "reward": 0.875, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.4375, "step": 590 }, { "completion_length": 146.0625, "epoch": 0.6325929890286326, "grad_norm": 0.28554344177246094, "kl": 0.005859751836396754, "learning_rate": 7.344753747323341e-06, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 591 }, { "completion_length": 124.0, "epoch": 0.6336633663366337, "grad_norm": 0.22264806926250458, "kl": 0.009809597861021757, "learning_rate": 7.323340471092077e-06, "loss": 0.0004, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 592 }, { "completion_length": 137.6875, "epoch": 0.6347337436446348, "grad_norm": 0.4564186930656433, "kl": 0.008125318097881973, "learning_rate": 7.301927194860814e-06, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.46875, "step": 593 }, { "completion_length": 125.0, "epoch": 0.6358041209526358, "grad_norm": 0.42132651805877686, "kl": 0.0197669870685786, "learning_rate": 7.28051391862955e-06, "loss": 0.0008, "reward": 1.21875, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 594 }, { "completion_length": 140.1875, "epoch": 0.6368744982606369, "grad_norm": 0.3397495448589325, "kl": 0.007090369239449501, "learning_rate": 7.259100642398288e-06, "loss": 0.0003, "reward": 1.15625, "reward_std": 0.22097086906433105, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.40625, "step": 595 }, { "completion_length": 141.0, "epoch": 0.637944875568638, "grad_norm": 0.2913299798965454, "kl": 0.011232900200411677, "learning_rate": 7.237687366167024e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.39774754643440247, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.28125, "step": 596 }, { "completion_length": 137.125, "epoch": 0.639015252876639, "grad_norm": 0.3349488377571106, "kl": 0.009077495080418885, "learning_rate": 7.216274089935761e-06, "loss": 0.0004, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 597 }, { "completion_length": 144.5, "epoch": 0.64008563018464, "grad_norm": 0.21602429449558258, "kl": 0.008281277259811759, "learning_rate": 7.194860813704497e-06, "loss": 0.0003, "reward": 1.34375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.46875, "step": 598 }, { "completion_length": 138.9375, "epoch": 0.6411560074926411, "grad_norm": 0.4473321735858917, "kl": 0.0061393583891913295, "learning_rate": 7.173447537473234e-06, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.2651650421321392, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 599 }, { "completion_length": 132.6875, "epoch": 0.6422263848006422, "grad_norm": 0.24021965265274048, "kl": 0.009652534266933799, "learning_rate": 7.15203426124197e-06, "loss": 0.0004, "reward": 1.15625, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 600 }, { "completion_length": 122.75, "epoch": 0.6432967621086433, "grad_norm": 0.15295614302158356, "kl": 0.008629188872873783, "learning_rate": 7.1306209850107075e-06, "loss": 0.0003, "reward": 1.125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 601 }, { "completion_length": 133.6875, "epoch": 0.6443671394166444, "grad_norm": 0.45031824707984924, "kl": 0.028709097765386105, "learning_rate": 7.109207708779444e-06, "loss": 0.0011, "reward": 0.84375, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 602 }, { "completion_length": 149.75, "epoch": 0.6454375167246454, "grad_norm": 0.388879656791687, "kl": 0.005741750355809927, "learning_rate": 7.0877944325481805e-06, "loss": 0.0002, "reward": 1.03125, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 603 }, { "completion_length": 148.375, "epoch": 0.6465078940326465, "grad_norm": 0.27051112055778503, "kl": 0.006536646164022386, "learning_rate": 7.066381156316917e-06, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 604 }, { "completion_length": 139.0, "epoch": 0.6475782713406476, "grad_norm": 0.20055554807186127, "kl": 0.007611157081555575, "learning_rate": 7.0449678800856535e-06, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.375, "step": 605 }, { "completion_length": 131.0625, "epoch": 0.6486486486486487, "grad_norm": 0.17921480536460876, "kl": 0.006959068006835878, "learning_rate": 7.02355460385439e-06, "loss": 0.0003, "reward": 1.125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 606 }, { "completion_length": 130.125, "epoch": 0.6497190259566498, "grad_norm": 0.7518537044525146, "kl": 0.012131765484809875, "learning_rate": 7.0021413276231265e-06, "loss": 0.0005, "reward": 1.0625, "reward_std": 0.6187184229493141, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.5, "step": 607 }, { "completion_length": 146.9375, "epoch": 0.6507894032646507, "grad_norm": 0.3497796654701233, "kl": 0.013562145177274942, "learning_rate": 6.980728051391864e-06, "loss": 0.0005, "reward": 1.15625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 608 }, { "completion_length": 127.1875, "epoch": 0.6518597805726518, "grad_norm": 0.01271997019648552, "kl": 0.006386495311744511, "learning_rate": 6.9593147751606e-06, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 609 }, { "completion_length": 124.3125, "epoch": 0.6529301578806529, "grad_norm": 0.12044579535722733, "kl": 0.015429887222126126, "learning_rate": 6.937901498929337e-06, "loss": 0.0006, "reward": 0.96875, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 610 }, { "completion_length": 138.5625, "epoch": 0.654000535188654, "grad_norm": 0.01911737769842148, "kl": 0.006480256502982229, "learning_rate": 6.916488222698073e-06, "loss": 0.0003, "reward": 1.3125, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.4375, "step": 611 }, { "completion_length": 127.75, "epoch": 0.6550709124966551, "grad_norm": 0.2172321230173111, "kl": 0.006093419040553272, "learning_rate": 6.89507494646681e-06, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 612 }, { "completion_length": 137.9375, "epoch": 0.6561412898046561, "grad_norm": 0.42796948552131653, "kl": 0.011987971956841648, "learning_rate": 6.873661670235546e-06, "loss": 0.0005, "reward": 1.28125, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 613 }, { "completion_length": 146.75, "epoch": 0.6572116671126572, "grad_norm": 0.38820627331733704, "kl": 0.009291197871789336, "learning_rate": 6.852248394004283e-06, "loss": 0.0004, "reward": 1.0, "reward_std": 0.5303300730884075, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 614 }, { "completion_length": 136.5, "epoch": 0.6582820444206583, "grad_norm": 0.14232227206230164, "kl": 0.007353786379098892, "learning_rate": 6.83083511777302e-06, "loss": 0.0003, "reward": 1.0, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 615 }, { "completion_length": 150.0, "epoch": 0.6593524217286594, "grad_norm": 0.43917161226272583, "kl": 0.00911213643848896, "learning_rate": 6.809421841541757e-06, "loss": 0.0004, "reward": 0.96875, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 616 }, { "completion_length": 134.8125, "epoch": 0.6604227990366605, "grad_norm": 0.2939397096633911, "kl": 0.007568636443465948, "learning_rate": 6.788008565310493e-06, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 617 }, { "completion_length": 140.6875, "epoch": 0.6614931763446615, "grad_norm": 0.21770115196704865, "kl": 0.008107412548270077, "learning_rate": 6.76659528907923e-06, "loss": 0.0003, "reward": 1.15625, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 618 }, { "completion_length": 135.6875, "epoch": 0.6625635536526626, "grad_norm": 0.37311598658561707, "kl": 0.009011300629936159, "learning_rate": 6.745182012847966e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.40625, "step": 619 }, { "completion_length": 142.0625, "epoch": 0.6636339309606636, "grad_norm": 0.2269369661808014, "kl": 0.006384911946952343, "learning_rate": 6.723768736616703e-06, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.34375, "step": 620 }, { "completion_length": 142.8125, "epoch": 0.6647043082686647, "grad_norm": 0.10586193203926086, "kl": 0.006069877999834716, "learning_rate": 6.702355460385439e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.34375, "step": 621 }, { "completion_length": 141.5625, "epoch": 0.6657746855766657, "grad_norm": 0.18946292996406555, "kl": 0.005843171966262162, "learning_rate": 6.6809421841541765e-06, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.40625, "step": 622 }, { "completion_length": 135.4375, "epoch": 0.6668450628846668, "grad_norm": 0.18185573816299438, "kl": 0.007373653235845268, "learning_rate": 6.659528907922913e-06, "loss": 0.0003, "reward": 1.0, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 623 }, { "completion_length": 131.875, "epoch": 0.6679154401926679, "grad_norm": 0.5080950260162354, "kl": 0.01634050882421434, "learning_rate": 6.6381156316916495e-06, "loss": 0.0007, "reward": 1.03125, "reward_std": 0.48613590747117996, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 624 }, { "completion_length": 123.75, "epoch": 0.668985817500669, "grad_norm": 0.25900015234947205, "kl": 0.008645714027807117, "learning_rate": 6.616702355460386e-06, "loss": 0.0003, "reward": 1.15625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 625 }, { "completion_length": 133.0, "epoch": 0.6700561948086701, "grad_norm": 0.41257792711257935, "kl": 0.009297259734012187, "learning_rate": 6.5952890792291225e-06, "loss": 0.0004, "reward": 1.03125, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.46875, "step": 626 }, { "completion_length": 132.0, "epoch": 0.6711265721166711, "grad_norm": 0.3156934678554535, "kl": 0.008284906740300357, "learning_rate": 6.573875802997859e-06, "loss": 0.0003, "reward": 1.25, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 627 }, { "completion_length": 142.9375, "epoch": 0.6721969494246722, "grad_norm": 0.12793464958667755, "kl": 0.013345772167667747, "learning_rate": 6.5524625267665955e-06, "loss": 0.0005, "reward": 1.09375, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 628 }, { "completion_length": 139.9375, "epoch": 0.6732673267326733, "grad_norm": 0.3752954304218292, "kl": 0.007347576203756034, "learning_rate": 6.531049250535333e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.4375, "step": 629 }, { "completion_length": 137.75, "epoch": 0.6743377040406744, "grad_norm": 0.144851416349411, "kl": 0.013656549854204059, "learning_rate": 6.509635974304069e-06, "loss": 0.0005, "reward": 1.21875, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 630 }, { "completion_length": 135.9375, "epoch": 0.6754080813486754, "grad_norm": 0.13626298308372498, "kl": 0.009230749914422631, "learning_rate": 6.488222698072806e-06, "loss": 0.0004, "reward": 1.28125, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 631 }, { "completion_length": 133.3125, "epoch": 0.6764784586566764, "grad_norm": 0.3290061354637146, "kl": 0.009543088031932712, "learning_rate": 6.466809421841542e-06, "loss": 0.0004, "reward": 0.90625, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 632 }, { "completion_length": 127.25, "epoch": 0.6775488359646775, "grad_norm": 0.294765830039978, "kl": 0.00979562517022714, "learning_rate": 6.445396145610279e-06, "loss": 0.0004, "reward": 1.09375, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 633 }, { "completion_length": 128.6875, "epoch": 0.6786192132726786, "grad_norm": 0.24774757027626038, "kl": 0.028313649352639914, "learning_rate": 6.423982869379015e-06, "loss": 0.0011, "reward": 1.15625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 634 }, { "completion_length": 125.0, "epoch": 0.6796895905806797, "grad_norm": 0.33003249764442444, "kl": 0.007496201666072011, "learning_rate": 6.402569593147752e-06, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 635 }, { "completion_length": 141.75, "epoch": 0.6807599678886808, "grad_norm": 0.321207731962204, "kl": 0.008509653387591243, "learning_rate": 6.381156316916489e-06, "loss": 0.0003, "reward": 1.34375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.46875, "step": 636 }, { "completion_length": 135.375, "epoch": 0.6818303451966818, "grad_norm": 0.33936479687690735, "kl": 0.012052393751218915, "learning_rate": 6.359743040685226e-06, "loss": 0.0005, "reward": 0.6875, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.375, "step": 637 }, { "completion_length": 135.5, "epoch": 0.6829007225046829, "grad_norm": 0.3490801751613617, "kl": 0.010464445746038109, "learning_rate": 6.338329764453962e-06, "loss": 0.0004, "reward": 0.90625, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 638 }, { "completion_length": 118.625, "epoch": 0.683971099812684, "grad_norm": 0.35179266333580017, "kl": 0.008177536423318088, "learning_rate": 6.316916488222699e-06, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 639 }, { "completion_length": 117.9375, "epoch": 0.6850414771206851, "grad_norm": 0.015332052484154701, "kl": 0.007892038207501173, "learning_rate": 6.295503211991435e-06, "loss": 0.0003, "reward": 1.25, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 640 }, { "completion_length": 137.875, "epoch": 0.6861118544286862, "grad_norm": 0.35262179374694824, "kl": 0.008212179644033313, "learning_rate": 6.274089935760172e-06, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 641 }, { "completion_length": 135.0625, "epoch": 0.6871822317366871, "grad_norm": 0.2762969136238098, "kl": 0.007544721651356667, "learning_rate": 6.252676659528908e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 642 }, { "completion_length": 140.125, "epoch": 0.6882526090446882, "grad_norm": 0.3334350883960724, "kl": 0.009078879142180085, "learning_rate": 6.2312633832976455e-06, "loss": 0.0004, "reward": 1.21875, "reward_std": 0.39774756133556366, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 643 }, { "completion_length": 133.8125, "epoch": 0.6893229863526893, "grad_norm": 0.2063576579093933, "kl": 0.008034960948862135, "learning_rate": 6.209850107066382e-06, "loss": 0.0003, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 644 }, { "completion_length": 137.5, "epoch": 0.6903933636606904, "grad_norm": 0.22979490458965302, "kl": 0.006848543067462742, "learning_rate": 6.1884368308351185e-06, "loss": 0.0003, "reward": 1.25, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 645 }, { "completion_length": 134.0625, "epoch": 0.6914637409686915, "grad_norm": 0.3237832486629486, "kl": 0.009535222663544118, "learning_rate": 6.167023554603855e-06, "loss": 0.0004, "reward": 1.09375, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 646 }, { "completion_length": 119.125, "epoch": 0.6925341182766925, "grad_norm": 0.07546544820070267, "kl": 0.021992412977851927, "learning_rate": 6.1456102783725915e-06, "loss": 0.0009, "reward": 1.1875, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 647 }, { "completion_length": 137.1875, "epoch": 0.6936044955846936, "grad_norm": 0.35371455550193787, "kl": 0.009388085105456412, "learning_rate": 6.124197002141328e-06, "loss": 0.0004, "reward": 1.125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 648 }, { "completion_length": 133.5, "epoch": 0.6946748728926947, "grad_norm": 0.32626765966415405, "kl": 0.01321537815965712, "learning_rate": 6.102783725910065e-06, "loss": 0.0005, "reward": 1.25, "reward_std": 0.3535533770918846, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 649 }, { "completion_length": 139.375, "epoch": 0.6957452502006958, "grad_norm": 0.3864395022392273, "kl": 0.0066954439971596, "learning_rate": 6.081370449678802e-06, "loss": 0.0003, "reward": 1.0, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 650 }, { "completion_length": 139.875, "epoch": 0.6968156275086969, "grad_norm": 0.17655745148658752, "kl": 0.006929204449988902, "learning_rate": 6.059957173447538e-06, "loss": 0.0003, "reward": 1.34375, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.46875, "step": 651 }, { "completion_length": 137.4375, "epoch": 0.6978860048166979, "grad_norm": 0.33431926369667053, "kl": 0.013877897057682276, "learning_rate": 6.038543897216275e-06, "loss": 0.0006, "reward": 1.25, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 652 }, { "completion_length": 124.4375, "epoch": 0.6989563821246989, "grad_norm": 0.3874164819717407, "kl": 0.013383459998294711, "learning_rate": 6.017130620985011e-06, "loss": 0.0005, "reward": 1.125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 653 }, { "completion_length": 141.9375, "epoch": 0.7000267594327, "grad_norm": 0.2784304618835449, "kl": 0.007713288418017328, "learning_rate": 5.995717344753748e-06, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 654 }, { "completion_length": 129.375, "epoch": 0.7010971367407011, "grad_norm": 0.22470128536224365, "kl": 0.020239718025550246, "learning_rate": 5.974304068522484e-06, "loss": 0.0008, "reward": 1.09375, "reward_std": 0.22097086906433105, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 655 }, { "completion_length": 128.6875, "epoch": 0.7021675140487021, "grad_norm": 0.3159515857696533, "kl": 0.010750668006949127, "learning_rate": 5.952890792291222e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.40625, "step": 656 }, { "completion_length": 135.3125, "epoch": 0.7032378913567032, "grad_norm": 0.011365106329321861, "kl": 0.01103117666207254, "learning_rate": 5.931477516059958e-06, "loss": 0.0004, "reward": 1.0, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 657 }, { "completion_length": 123.8125, "epoch": 0.7043082686647043, "grad_norm": 0.3366810083389282, "kl": 0.006276113970670849, "learning_rate": 5.910064239828695e-06, "loss": 0.0003, "reward": 1.3125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.5, "step": 658 }, { "completion_length": 141.875, "epoch": 0.7053786459727054, "grad_norm": 0.3760837912559509, "kl": 0.006531917490065098, "learning_rate": 5.888650963597431e-06, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 659 }, { "completion_length": 126.9375, "epoch": 0.7064490232807065, "grad_norm": 0.2456383854150772, "kl": 0.009985500480979681, "learning_rate": 5.867237687366168e-06, "loss": 0.0004, "reward": 1.34375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.46875, "step": 660 }, { "completion_length": 133.4375, "epoch": 0.7075194005887075, "grad_norm": 0.23309442400932312, "kl": 0.0091017906088382, "learning_rate": 5.845824411134904e-06, "loss": 0.0004, "reward": 1.15625, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.40625, "step": 661 }, { "completion_length": 134.8125, "epoch": 0.7085897778967086, "grad_norm": 0.36190518736839294, "kl": 0.009649162413552403, "learning_rate": 5.824411134903641e-06, "loss": 0.0004, "reward": 1.0625, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 662 }, { "completion_length": 132.3125, "epoch": 0.7096601552047097, "grad_norm": 0.3767833113670349, "kl": 0.012279619462788105, "learning_rate": 5.802997858672378e-06, "loss": 0.0005, "reward": 1.125, "reward_std": 0.5303300768136978, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 663 }, { "completion_length": 111.6875, "epoch": 0.7107305325127107, "grad_norm": 0.49927473068237305, "kl": 0.009873309871181846, "learning_rate": 5.7815845824411145e-06, "loss": 0.0004, "reward": 1.1875, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.5, "step": 664 }, { "completion_length": 139.5625, "epoch": 0.7118009098207118, "grad_norm": 0.3030162453651428, "kl": 0.013099351665005088, "learning_rate": 5.760171306209851e-06, "loss": 0.0005, "reward": 1.375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.5, "step": 665 }, { "completion_length": 121.125, "epoch": 0.7128712871287128, "grad_norm": 0.4062125086784363, "kl": 0.01066346128936857, "learning_rate": 5.7387580299785874e-06, "loss": 0.0004, "reward": 0.9375, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.4375, "step": 666 }, { "completion_length": 137.9375, "epoch": 0.7139416644367139, "grad_norm": 0.285330593585968, "kl": 0.010435440577566624, "learning_rate": 5.717344753747324e-06, "loss": 0.0004, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 667 }, { "completion_length": 134.0625, "epoch": 0.715012041744715, "grad_norm": 0.3382175862789154, "kl": 0.010269683320075274, "learning_rate": 5.6959314775160604e-06, "loss": 0.0004, "reward": 1.1875, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 668 }, { "completion_length": 136.5, "epoch": 0.7160824190527161, "grad_norm": 0.4115214943885803, "kl": 0.009192675817757845, "learning_rate": 5.674518201284797e-06, "loss": 0.0004, "reward": 1.0, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 669 }, { "completion_length": 137.875, "epoch": 0.7171527963607172, "grad_norm": 0.4873230755329132, "kl": 0.009984387317672372, "learning_rate": 5.653104925053534e-06, "loss": 0.0004, "reward": 1.1875, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 670 }, { "completion_length": 119.375, "epoch": 0.7182231736687182, "grad_norm": 0.44264909625053406, "kl": 0.012268287828192115, "learning_rate": 5.631691648822271e-06, "loss": 0.0005, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 671 }, { "completion_length": 138.9375, "epoch": 0.7192935509767193, "grad_norm": 0.28828421235084534, "kl": 0.007146801159251481, "learning_rate": 5.610278372591007e-06, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.25, "step": 672 }, { "completion_length": 140.0, "epoch": 0.7203639282847204, "grad_norm": 0.28256407380104065, "kl": 0.005234807846136391, "learning_rate": 5.588865096359744e-06, "loss": 0.0002, "reward": 1.0, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 673 }, { "completion_length": 126.5, "epoch": 0.7214343055927215, "grad_norm": 0.39136582612991333, "kl": 0.007678528083488345, "learning_rate": 5.56745182012848e-06, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 674 }, { "completion_length": 145.1875, "epoch": 0.7225046829007225, "grad_norm": 0.21063315868377686, "kl": 0.005507475812919438, "learning_rate": 5.546038543897217e-06, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.5, "step": 675 }, { "completion_length": 130.0625, "epoch": 0.7235750602087235, "grad_norm": 0.25470808148384094, "kl": 0.007666324032470584, "learning_rate": 5.524625267665953e-06, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.5, "step": 676 }, { "completion_length": 127.5, "epoch": 0.7246454375167246, "grad_norm": 0.026533028110861778, "kl": 0.011546806432306767, "learning_rate": 5.503211991434691e-06, "loss": 0.0005, "reward": 1.375, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.5, "step": 677 }, { "completion_length": 119.1875, "epoch": 0.7257158148247257, "grad_norm": 0.15099424123764038, "kl": 0.01650574477389455, "learning_rate": 5.481798715203427e-06, "loss": 0.0007, "reward": 1.4375, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.9375, "rewards/format_reward_func_qa": 0.5, "step": 678 }, { "completion_length": 137.0625, "epoch": 0.7267861921327268, "grad_norm": 0.42659008502960205, "kl": 0.016689285403117537, "learning_rate": 5.460385438972164e-06, "loss": 0.0007, "reward": 1.21875, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 679 }, { "completion_length": 115.6875, "epoch": 0.7278565694407279, "grad_norm": 0.33350950479507446, "kl": 0.006778756738640368, "learning_rate": 5.4389721627409e-06, "loss": 0.0003, "reward": 1.09375, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 680 }, { "completion_length": 132.0, "epoch": 0.7289269467487289, "grad_norm": 0.29262155294418335, "kl": 0.010527301114052534, "learning_rate": 5.417558886509637e-06, "loss": 0.0004, "reward": 0.875, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.3125, "step": 681 }, { "completion_length": 130.75, "epoch": 0.72999732405673, "grad_norm": 0.2774050533771515, "kl": 0.007614015485160053, "learning_rate": 5.396145610278373e-06, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 682 }, { "completion_length": 139.1875, "epoch": 0.7310677013647311, "grad_norm": 0.3175061345100403, "kl": 0.005789973074570298, "learning_rate": 5.374732334047109e-06, "loss": 0.0002, "reward": 1.125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 683 }, { "completion_length": 135.0, "epoch": 0.7321380786727322, "grad_norm": 0.3578486144542694, "kl": 0.0068925535306334496, "learning_rate": 5.353319057815847e-06, "loss": 0.0003, "reward": 1.125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 684 }, { "completion_length": 129.8125, "epoch": 0.7332084559807333, "grad_norm": 0.3105612099170685, "kl": 0.015828246949240565, "learning_rate": 5.3319057815845834e-06, "loss": 0.0006, "reward": 0.90625, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 685 }, { "completion_length": 111.6875, "epoch": 0.7342788332887343, "grad_norm": 0.22875040769577026, "kl": 0.008531110710464418, "learning_rate": 5.31049250535332e-06, "loss": 0.0003, "reward": 1.25, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 686 }, { "completion_length": 114.125, "epoch": 0.7353492105967353, "grad_norm": 0.2765522003173828, "kl": 0.009360921918414533, "learning_rate": 5.2890792291220564e-06, "loss": 0.0004, "reward": 1.125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 687 }, { "completion_length": 122.3125, "epoch": 0.7364195879047364, "grad_norm": 0.32797130942344666, "kl": 0.008344789501279593, "learning_rate": 5.267665952890793e-06, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.39774756133556366, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 688 }, { "completion_length": 143.25, "epoch": 0.7374899652127375, "grad_norm": 0.33672067523002625, "kl": 0.00890603190055117, "learning_rate": 5.2462526766595286e-06, "loss": 0.0004, "reward": 1.03125, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 689 }, { "completion_length": 129.25, "epoch": 0.7385603425207385, "grad_norm": 0.1356227844953537, "kl": 0.007539765443652868, "learning_rate": 5.224839400428265e-06, "loss": 0.0003, "reward": 1.28125, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 690 }, { "completion_length": 141.9375, "epoch": 0.7396307198287396, "grad_norm": 0.2940700352191925, "kl": 0.008551972452551126, "learning_rate": 5.203426124197003e-06, "loss": 0.0003, "reward": 1.0, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 691 }, { "completion_length": 140.8125, "epoch": 0.7407010971367407, "grad_norm": 0.3995163142681122, "kl": 0.009525564732030034, "learning_rate": 5.18201284796574e-06, "loss": 0.0004, "reward": 0.90625, "reward_std": 0.39774755761027336, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 692 }, { "completion_length": 130.5625, "epoch": 0.7417714744447418, "grad_norm": 0.23215217888355255, "kl": 0.012463737977668643, "learning_rate": 5.160599571734476e-06, "loss": 0.0005, "reward": 1.125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 693 }, { "completion_length": 131.0625, "epoch": 0.7428418517527429, "grad_norm": 0.2529661953449249, "kl": 0.009736015577800572, "learning_rate": 5.139186295503213e-06, "loss": 0.0004, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 694 }, { "completion_length": 136.8125, "epoch": 0.743912229060744, "grad_norm": 0.3215126097202301, "kl": 0.007723555318079889, "learning_rate": 5.117773019271948e-06, "loss": 0.0003, "reward": 1.34375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.46875, "step": 695 }, { "completion_length": 135.4375, "epoch": 0.744982606368745, "grad_norm": 0.32758599519729614, "kl": 0.007883366430178285, "learning_rate": 5.096359743040685e-06, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 696 }, { "completion_length": 145.75, "epoch": 0.7460529836767461, "grad_norm": 0.27535104751586914, "kl": 0.007911258959211409, "learning_rate": 5.074946466809421e-06, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 697 }, { "completion_length": 130.75, "epoch": 0.7471233609847471, "grad_norm": 0.3650718927383423, "kl": 0.011625023442320526, "learning_rate": 5.05353319057816e-06, "loss": 0.0005, "reward": 1.21875, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.40625, "step": 698 }, { "completion_length": 132.3125, "epoch": 0.7481937382927482, "grad_norm": 0.18983222544193268, "kl": 0.009241326129995286, "learning_rate": 5.032119914346896e-06, "loss": 0.0004, "reward": 1.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.5, "step": 699 }, { "completion_length": 139.75, "epoch": 0.7492641156007492, "grad_norm": 0.2799305021762848, "kl": 0.011063303099945188, "learning_rate": 5.010706638115633e-06, "loss": 0.0004, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 700 }, { "completion_length": 133.125, "epoch": 0.7503344929087503, "grad_norm": 0.4518125653266907, "kl": 0.01513196830637753, "learning_rate": 4.989293361884368e-06, "loss": 0.0006, "reward": 0.84375, "reward_std": 0.48613590747117996, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 701 }, { "completion_length": 124.9375, "epoch": 0.7514048702167514, "grad_norm": 0.23378019034862518, "kl": 0.015026301378384233, "learning_rate": 4.967880085653105e-06, "loss": 0.0006, "reward": 1.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.5, "step": 702 }, { "completion_length": 117.0, "epoch": 0.7524752475247525, "grad_norm": 0.2737342119216919, "kl": 0.021560787805356085, "learning_rate": 4.946466809421842e-06, "loss": 0.0009, "reward": 1.1875, "reward_std": 0.4419417232275009, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 703 }, { "completion_length": 136.5625, "epoch": 0.7535456248327536, "grad_norm": 0.40370312333106995, "kl": 0.007427525473758578, "learning_rate": 4.9250535331905786e-06, "loss": 0.0003, "reward": 1.09375, "reward_std": 0.39774756133556366, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 704 }, { "completion_length": 133.8125, "epoch": 0.7546160021407546, "grad_norm": 0.37258180975914, "kl": 0.006497728987596929, "learning_rate": 4.903640256959315e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.3125, "step": 705 }, { "completion_length": 113.875, "epoch": 0.7556863794487557, "grad_norm": 0.2611464858055115, "kl": 0.019363462459295988, "learning_rate": 4.882226980728052e-06, "loss": 0.0008, "reward": 1.03125, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 706 }, { "completion_length": 130.375, "epoch": 0.7567567567567568, "grad_norm": 0.3427421748638153, "kl": 0.00915329356212169, "learning_rate": 4.860813704496788e-06, "loss": 0.0004, "reward": 1.34375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.46875, "step": 707 }, { "completion_length": 149.5, "epoch": 0.7578271340647579, "grad_norm": 0.27053922414779663, "kl": 0.007040401687845588, "learning_rate": 4.8394004282655246e-06, "loss": 0.0003, "reward": 1.34375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.46875, "step": 708 }, { "completion_length": 137.125, "epoch": 0.7588975113727588, "grad_norm": 0.2616964876651764, "kl": 0.007255143485963345, "learning_rate": 4.817987152034261e-06, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.48613589257001877, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 709 }, { "completion_length": 139.25, "epoch": 0.7599678886807599, "grad_norm": 0.2552794814109802, "kl": 0.010888232616707683, "learning_rate": 4.796573875802998e-06, "loss": 0.0004, "reward": 0.6875, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.25, "step": 710 }, { "completion_length": 122.9375, "epoch": 0.761038265988761, "grad_norm": 0.3355708122253418, "kl": 0.01230055675841868, "learning_rate": 4.775160599571735e-06, "loss": 0.0005, "reward": 1.1875, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 711 }, { "completion_length": 132.375, "epoch": 0.7621086432967621, "grad_norm": 0.02233867347240448, "kl": 0.011891684727743268, "learning_rate": 4.753747323340471e-06, "loss": 0.0005, "reward": 1.0, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 712 }, { "completion_length": 133.75, "epoch": 0.7631790206047632, "grad_norm": 0.25756651163101196, "kl": 0.007048476487398148, "learning_rate": 4.732334047109208e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.30935920402407646, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.28125, "step": 713 }, { "completion_length": 141.125, "epoch": 0.7642493979127643, "grad_norm": 0.3174949586391449, "kl": 0.008112253388389945, "learning_rate": 4.710920770877944e-06, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.4375, "step": 714 }, { "completion_length": 125.8125, "epoch": 0.7653197752207653, "grad_norm": 0.12343992292881012, "kl": 0.007230265997350216, "learning_rate": 4.689507494646681e-06, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 715 }, { "completion_length": 145.5, "epoch": 0.7663901525287664, "grad_norm": 0.38323333859443665, "kl": 0.007825030945241451, "learning_rate": 4.668094218415418e-06, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 716 }, { "completion_length": 147.8125, "epoch": 0.7674605298367675, "grad_norm": 0.2288675755262375, "kl": 0.010019779205322266, "learning_rate": 4.646680942184155e-06, "loss": 0.0004, "reward": 1.0625, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.375, "step": 717 }, { "completion_length": 138.5, "epoch": 0.7685309071447686, "grad_norm": 0.24786271154880524, "kl": 0.008649087743833661, "learning_rate": 4.625267665952891e-06, "loss": 0.0003, "reward": 1.0, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 718 }, { "completion_length": 140.3125, "epoch": 0.7696012844527697, "grad_norm": 0.44203072786331177, "kl": 0.009343030746094882, "learning_rate": 4.603854389721628e-06, "loss": 0.0004, "reward": 1.09375, "reward_std": 0.39774755015969276, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 719 }, { "completion_length": 127.625, "epoch": 0.7706716617607706, "grad_norm": 0.3602650761604309, "kl": 0.01164180925115943, "learning_rate": 4.582441113490364e-06, "loss": 0.0005, "reward": 0.9375, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.4375, "step": 720 }, { "completion_length": 119.6875, "epoch": 0.7717420390687717, "grad_norm": 0.13708187639713287, "kl": 0.011513182427734137, "learning_rate": 4.561027837259101e-06, "loss": 0.0005, "reward": 1.4375, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.9375, "rewards/format_reward_func_qa": 0.5, "step": 721 }, { "completion_length": 133.6875, "epoch": 0.7728124163767728, "grad_norm": 0.41205987334251404, "kl": 0.007739911554381251, "learning_rate": 4.539614561027837e-06, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.4375, "step": 722 }, { "completion_length": 139.6875, "epoch": 0.7738827936847739, "grad_norm": 0.4428892433643341, "kl": 0.012770616100169718, "learning_rate": 4.5182012847965746e-06, "loss": 0.0005, "reward": 1.15625, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.40625, "step": 723 }, { "completion_length": 132.8125, "epoch": 0.7749531709927749, "grad_norm": 0.20104606449604034, "kl": 0.008847060962580144, "learning_rate": 4.496788008565311e-06, "loss": 0.0004, "reward": 1.3125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.5, "step": 724 }, { "completion_length": 130.6875, "epoch": 0.776023548300776, "grad_norm": 0.47708559036254883, "kl": 0.013226014911197126, "learning_rate": 4.4753747323340476e-06, "loss": 0.0005, "reward": 0.84375, "reward_std": 0.6629125848412514, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 725 }, { "completion_length": 128.125, "epoch": 0.7770939256087771, "grad_norm": 0.018889298662543297, "kl": 0.008179247146472335, "learning_rate": 4.453961456102784e-06, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 726 }, { "completion_length": 114.875, "epoch": 0.7781643029167782, "grad_norm": 0.297990620136261, "kl": 0.01714193425141275, "learning_rate": 4.4325481798715205e-06, "loss": 0.0007, "reward": 0.75, "reward_std": 0.2651650421321392, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.375, "step": 727 }, { "completion_length": 130.5625, "epoch": 0.7792346802247793, "grad_norm": 0.4072510004043579, "kl": 0.009169632103294134, "learning_rate": 4.411134903640257e-06, "loss": 0.0004, "reward": 1.25, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 728 }, { "completion_length": 145.8125, "epoch": 0.7803050575327803, "grad_norm": 0.29128971695899963, "kl": 0.01060357061214745, "learning_rate": 4.3897216274089935e-06, "loss": 0.0004, "reward": 1.03125, "reward_std": 0.30935920402407646, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 729 }, { "completion_length": 130.1875, "epoch": 0.7813754348407814, "grad_norm": 0.5146282911300659, "kl": 0.009237196878530085, "learning_rate": 4.368308351177731e-06, "loss": 0.0004, "reward": 0.90625, "reward_std": 0.662912592291832, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 730 }, { "completion_length": 124.5, "epoch": 0.7824458121487824, "grad_norm": 0.17889198660850525, "kl": 0.013388351071625948, "learning_rate": 4.346895074946467e-06, "loss": 0.0005, "reward": 0.96875, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.46875, "step": 731 }, { "completion_length": 138.875, "epoch": 0.7835161894567835, "grad_norm": 0.3132176697254181, "kl": 0.008426614571362734, "learning_rate": 4.325481798715204e-06, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.4419417232275009, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 732 }, { "completion_length": 127.0, "epoch": 0.7845865667647846, "grad_norm": 0.3185586929321289, "kl": 0.0067036411492154, "learning_rate": 4.30406852248394e-06, "loss": 0.0003, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 733 }, { "completion_length": 138.0625, "epoch": 0.7856569440727856, "grad_norm": 0.35601097345352173, "kl": 0.009409032645635307, "learning_rate": 4.282655246252677e-06, "loss": 0.0004, "reward": 0.96875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.46875, "step": 734 }, { "completion_length": 142.3125, "epoch": 0.7867273213807867, "grad_norm": 0.24955964088439941, "kl": 0.008224638178944588, "learning_rate": 4.261241970021413e-06, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.4375, "step": 735 }, { "completion_length": 143.875, "epoch": 0.7877976986887878, "grad_norm": 0.40461158752441406, "kl": 0.005321103963069618, "learning_rate": 4.23982869379015e-06, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.5, "step": 736 }, { "completion_length": 141.25, "epoch": 0.7888680759967889, "grad_norm": 0.3201189637184143, "kl": 0.01058421260677278, "learning_rate": 4.218415417558887e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.30935920402407646, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 737 }, { "completion_length": 129.9375, "epoch": 0.78993845330479, "grad_norm": 0.5239900350570679, "kl": 0.02398638939484954, "learning_rate": 4.197002141327624e-06, "loss": 0.001, "reward": 1.0, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.5, "step": 738 }, { "completion_length": 139.125, "epoch": 0.791008830612791, "grad_norm": 0.512116551399231, "kl": 0.012146550696343184, "learning_rate": 4.17558886509636e-06, "loss": 0.0005, "reward": 0.78125, "reward_std": 0.48613590002059937, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.40625, "step": 739 }, { "completion_length": 136.375, "epoch": 0.7920792079207921, "grad_norm": 0.18656423687934875, "kl": 0.005261167068965733, "learning_rate": 4.154175588865097e-06, "loss": 0.0002, "reward": 0.875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 740 }, { "completion_length": 139.5, "epoch": 0.7931495852287932, "grad_norm": 0.09205947816371918, "kl": 0.006993141782004386, "learning_rate": 4.132762312633833e-06, "loss": 0.0003, "reward": 1.4375, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.9375, "rewards/format_reward_func_qa": 0.5, "step": 741 }, { "completion_length": 133.125, "epoch": 0.7942199625367942, "grad_norm": 0.3071753978729248, "kl": 0.006768398452550173, "learning_rate": 4.11134903640257e-06, "loss": 0.0003, "reward": 1.09375, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 742 }, { "completion_length": 129.4375, "epoch": 0.7952903398447952, "grad_norm": 0.42720672488212585, "kl": 0.005937354522757232, "learning_rate": 4.089935760171306e-06, "loss": 0.0002, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.5, "step": 743 }, { "completion_length": 129.375, "epoch": 0.7963607171527963, "grad_norm": 0.19865697622299194, "kl": 0.00743037078063935, "learning_rate": 4.0685224839400435e-06, "loss": 0.0003, "reward": 1.375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.5, "step": 744 }, { "completion_length": 127.75, "epoch": 0.7974310944607974, "grad_norm": 0.4268759489059448, "kl": 0.009780740016140044, "learning_rate": 4.04710920770878e-06, "loss": 0.0004, "reward": 0.90625, "reward_std": 0.39774755761027336, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 745 }, { "completion_length": 142.5625, "epoch": 0.7985014717687985, "grad_norm": 0.4154244065284729, "kl": 0.009035979746840894, "learning_rate": 4.0256959314775165e-06, "loss": 0.0004, "reward": 1.03125, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 746 }, { "completion_length": 138.4375, "epoch": 0.7995718490767996, "grad_norm": 0.32028982043266296, "kl": 0.009531363379210234, "learning_rate": 4.004282655246253e-06, "loss": 0.0004, "reward": 1.15625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 747 }, { "completion_length": 121.6875, "epoch": 0.8006422263848006, "grad_norm": 0.21553994715213776, "kl": 0.007405009237118065, "learning_rate": 3.9828693790149895e-06, "loss": 0.0003, "reward": 0.875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 748 }, { "completion_length": 144.375, "epoch": 0.8017126036928017, "grad_norm": 0.41427382826805115, "kl": 0.01168112875893712, "learning_rate": 3.961456102783726e-06, "loss": 0.0005, "reward": 0.96875, "reward_std": 0.48613590747117996, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 749 }, { "completion_length": 138.625, "epoch": 0.8027829810008028, "grad_norm": 0.38947194814682007, "kl": 0.007287310610990971, "learning_rate": 3.9400428265524625e-06, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 750 }, { "completion_length": 124.5625, "epoch": 0.8038533583088039, "grad_norm": 0.2700449526309967, "kl": 0.015748731093481183, "learning_rate": 3.9186295503212e-06, "loss": 0.0006, "reward": 1.0, "reward_std": 0.2651650421321392, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 751 }, { "completion_length": 114.0625, "epoch": 0.804923735616805, "grad_norm": 0.29003432393074036, "kl": 0.011626899824477732, "learning_rate": 3.897216274089936e-06, "loss": 0.0005, "reward": 1.09375, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 752 }, { "completion_length": 145.125, "epoch": 0.8059941129248059, "grad_norm": 0.4052455425262451, "kl": 0.009512482327409089, "learning_rate": 3.875802997858673e-06, "loss": 0.0004, "reward": 1.03125, "reward_std": 0.30935920402407646, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 753 }, { "completion_length": 136.5625, "epoch": 0.807064490232807, "grad_norm": 0.2377774566411972, "kl": 0.007985687348991632, "learning_rate": 3.854389721627409e-06, "loss": 0.0003, "reward": 1.4375, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.9375, "rewards/format_reward_func_qa": 0.5, "step": 754 }, { "completion_length": 142.375, "epoch": 0.8081348675408081, "grad_norm": 0.3360620141029358, "kl": 0.015136545756831765, "learning_rate": 3.832976445396146e-06, "loss": 0.0006, "reward": 1.0625, "reward_std": 0.4419417195022106, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.375, "step": 755 }, { "completion_length": 145.9375, "epoch": 0.8092052448488092, "grad_norm": 0.3175034523010254, "kl": 0.00638822210021317, "learning_rate": 3.8115631691648823e-06, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.4419417232275009, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 756 }, { "completion_length": 120.5, "epoch": 0.8102756221568103, "grad_norm": 0.2903161644935608, "kl": 0.0092542968923226, "learning_rate": 3.790149892933619e-06, "loss": 0.0004, "reward": 1.34375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.46875, "step": 757 }, { "completion_length": 123.625, "epoch": 0.8113459994648113, "grad_norm": 0.16952285170555115, "kl": 0.0055615007295273244, "learning_rate": 3.7687366167023558e-06, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 758 }, { "completion_length": 135.0, "epoch": 0.8124163767728124, "grad_norm": 0.38953617215156555, "kl": 0.008708729175850749, "learning_rate": 3.7473233404710923e-06, "loss": 0.0003, "reward": 0.8125, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.375, "step": 759 }, { "completion_length": 140.375, "epoch": 0.8134867540808135, "grad_norm": 0.29380476474761963, "kl": 0.009717755485326052, "learning_rate": 3.7259100642398288e-06, "loss": 0.0004, "reward": 1.3125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.4375, "step": 760 }, { "completion_length": 135.0, "epoch": 0.8145571313888146, "grad_norm": 0.31055009365081787, "kl": 0.011133818654343486, "learning_rate": 3.7044967880085657e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 761 }, { "completion_length": 138.3125, "epoch": 0.8156275086968157, "grad_norm": 0.183935284614563, "kl": 0.007834234391339123, "learning_rate": 3.683083511777302e-06, "loss": 0.0003, "reward": 1.25, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 762 }, { "completion_length": 133.625, "epoch": 0.8166978860048167, "grad_norm": 0.39877253770828247, "kl": 0.007954739907290787, "learning_rate": 3.6616702355460387e-06, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 763 }, { "completion_length": 142.6875, "epoch": 0.8177682633128178, "grad_norm": 0.38166213035583496, "kl": 0.008339807973243296, "learning_rate": 3.640256959314775e-06, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.3125, "step": 764 }, { "completion_length": 143.25, "epoch": 0.8188386406208188, "grad_norm": 0.11621326208114624, "kl": 0.00856549059972167, "learning_rate": 3.618843683083512e-06, "loss": 0.0003, "reward": 1.0, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 765 }, { "completion_length": 126.9375, "epoch": 0.8199090179288199, "grad_norm": 0.010702353902161121, "kl": 0.00553610606584698, "learning_rate": 3.5974304068522486e-06, "loss": 0.0002, "reward": 1.125, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.5, "step": 766 }, { "completion_length": 150.0, "epoch": 0.820979395236821, "grad_norm": 0.41128456592559814, "kl": 0.006315740291029215, "learning_rate": 3.576017130620985e-06, "loss": 0.0003, "reward": 0.75, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.375, "step": 767 }, { "completion_length": 135.5625, "epoch": 0.822049772544822, "grad_norm": 0.09094851464033127, "kl": 0.01035697991028428, "learning_rate": 3.554603854389722e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.28125, "step": 768 }, { "completion_length": 126.375, "epoch": 0.8231201498528231, "grad_norm": 0.4191420078277588, "kl": 0.021569173084571958, "learning_rate": 3.5331905781584585e-06, "loss": 0.0009, "reward": 1.1875, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 769 }, { "completion_length": 139.5625, "epoch": 0.8241905271608242, "grad_norm": 0.4280831217765808, "kl": 0.007510704919695854, "learning_rate": 3.511777301927195e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.40625, "step": 770 }, { "completion_length": 138.0, "epoch": 0.8252609044688253, "grad_norm": 0.35253897309303284, "kl": 0.010257675778120756, "learning_rate": 3.490364025695932e-06, "loss": 0.0004, "reward": 1.21875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 771 }, { "completion_length": 138.875, "epoch": 0.8263312817768264, "grad_norm": 0.4066043198108673, "kl": 0.007693257532082498, "learning_rate": 3.4689507494646684e-06, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 772 }, { "completion_length": 126.8125, "epoch": 0.8274016590848274, "grad_norm": 0.237070694565773, "kl": 0.008008228498511016, "learning_rate": 3.447537473233405e-06, "loss": 0.0003, "reward": 1.0, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 773 }, { "completion_length": 124.8125, "epoch": 0.8284720363928285, "grad_norm": 0.13488724827766418, "kl": 0.012019463698379695, "learning_rate": 3.4261241970021414e-06, "loss": 0.0005, "reward": 1.21875, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 774 }, { "completion_length": 146.5625, "epoch": 0.8295424137008296, "grad_norm": 0.26293399930000305, "kl": 0.008736005169339478, "learning_rate": 3.4047109207708783e-06, "loss": 0.0003, "reward": 1.125, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.5, "step": 775 }, { "completion_length": 142.25, "epoch": 0.8306127910088306, "grad_norm": 0.17039544880390167, "kl": 0.006958545185625553, "learning_rate": 3.383297644539615e-06, "loss": 0.0003, "reward": 1.0, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 776 }, { "completion_length": 129.5625, "epoch": 0.8316831683168316, "grad_norm": 0.21733544766902924, "kl": 0.0077228323789313436, "learning_rate": 3.3618843683083513e-06, "loss": 0.0003, "reward": 1.25, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 777 }, { "completion_length": 133.875, "epoch": 0.8327535456248327, "grad_norm": 0.36786606907844543, "kl": 0.012740704114548862, "learning_rate": 3.3404710920770882e-06, "loss": 0.0005, "reward": 1.25, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 778 }, { "completion_length": 132.125, "epoch": 0.8338239229328338, "grad_norm": 0.24922770261764526, "kl": 0.01986302505247295, "learning_rate": 3.3190578158458247e-06, "loss": 0.0008, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 779 }, { "completion_length": 144.75, "epoch": 0.8348943002408349, "grad_norm": 0.2995467483997345, "kl": 0.006097907433286309, "learning_rate": 3.2976445396145612e-06, "loss": 0.0002, "reward": 1.0, "reward_std": 0.3535533770918846, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 780 }, { "completion_length": 132.0, "epoch": 0.835964677548836, "grad_norm": 0.015121438540518284, "kl": 0.010043411748483777, "learning_rate": 3.2762312633832977e-06, "loss": 0.0004, "reward": 1.1875, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 781 }, { "completion_length": 128.875, "epoch": 0.837035054856837, "grad_norm": 0.13605183362960815, "kl": 0.01055660075508058, "learning_rate": 3.2548179871520347e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 782 }, { "completion_length": 136.0625, "epoch": 0.8381054321648381, "grad_norm": 0.21722069382667542, "kl": 0.009596669115126133, "learning_rate": 3.233404710920771e-06, "loss": 0.0004, "reward": 1.03125, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 783 }, { "completion_length": 123.0, "epoch": 0.8391758094728392, "grad_norm": 0.1666470170021057, "kl": 0.008349197276402265, "learning_rate": 3.2119914346895077e-06, "loss": 0.0003, "reward": 1.4375, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.9375, "rewards/format_reward_func_qa": 0.5, "step": 784 }, { "completion_length": 133.1875, "epoch": 0.8402461867808403, "grad_norm": 0.44361019134521484, "kl": 0.009646572405472398, "learning_rate": 3.1905781584582446e-06, "loss": 0.0004, "reward": 0.90625, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 785 }, { "completion_length": 138.4375, "epoch": 0.8413165640888414, "grad_norm": 0.49110177159309387, "kl": 0.010870481026358902, "learning_rate": 3.169164882226981e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.39774755015969276, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.40625, "step": 786 }, { "completion_length": 124.75, "epoch": 0.8423869413968423, "grad_norm": 0.27867090702056885, "kl": 0.006642708147410303, "learning_rate": 3.1477516059957176e-06, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 787 }, { "completion_length": 148.9375, "epoch": 0.8434573187048434, "grad_norm": 0.5652334094047546, "kl": 0.013323272345587611, "learning_rate": 3.126338329764454e-06, "loss": 0.0005, "reward": 1.09375, "reward_std": 0.39774755761027336, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 788 }, { "completion_length": 125.0625, "epoch": 0.8445276960128445, "grad_norm": 0.3123376667499542, "kl": 0.007458559761289507, "learning_rate": 3.104925053533191e-06, "loss": 0.0003, "reward": 1.09375, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 789 }, { "completion_length": 138.5625, "epoch": 0.8455980733208456, "grad_norm": 0.19149772822856903, "kl": 0.006641577347181737, "learning_rate": 3.0835117773019275e-06, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 790 }, { "completion_length": 132.1875, "epoch": 0.8466684506288467, "grad_norm": 0.36270633339881897, "kl": 0.010878173867240548, "learning_rate": 3.062098501070664e-06, "loss": 0.0004, "reward": 0.78125, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.34375, "step": 791 }, { "completion_length": 130.4375, "epoch": 0.8477388279368477, "grad_norm": 0.5001871585845947, "kl": 0.012055143190082163, "learning_rate": 3.040685224839401e-06, "loss": 0.0005, "reward": 1.0625, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.375, "step": 792 }, { "completion_length": 146.5625, "epoch": 0.8488092052448488, "grad_norm": 0.3140570819377899, "kl": 0.006688898545689881, "learning_rate": 3.0192719486081374e-06, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 793 }, { "completion_length": 135.5, "epoch": 0.8498795825528499, "grad_norm": 0.2951427102088928, "kl": 0.007249632850289345, "learning_rate": 2.997858672376874e-06, "loss": 0.0003, "reward": 1.15625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 794 }, { "completion_length": 123.8125, "epoch": 0.850949959860851, "grad_norm": 0.10558971017599106, "kl": 0.005828561610542238, "learning_rate": 2.976445396145611e-06, "loss": 0.0002, "reward": 1.40625, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.9375, "rewards/format_reward_func_qa": 0.46875, "step": 795 }, { "completion_length": 140.0625, "epoch": 0.8520203371688521, "grad_norm": 0.1720445454120636, "kl": 0.011843117186799645, "learning_rate": 2.9550321199143473e-06, "loss": 0.0005, "reward": 1.09375, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 796 }, { "completion_length": 138.375, "epoch": 0.8530907144768531, "grad_norm": 0.40671685338020325, "kl": 0.009810693794861436, "learning_rate": 2.933618843683084e-06, "loss": 0.0004, "reward": 1.25, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 797 }, { "completion_length": 139.5, "epoch": 0.8541610917848541, "grad_norm": 0.009222809225320816, "kl": 0.006754498928785324, "learning_rate": 2.9122055674518203e-06, "loss": 0.0003, "reward": 1.125, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.5, "step": 798 }, { "completion_length": 122.1875, "epoch": 0.8552314690928552, "grad_norm": 0.35098397731781006, "kl": 0.007227005786262453, "learning_rate": 2.8907922912205572e-06, "loss": 0.0003, "reward": 1.0, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 799 }, { "completion_length": 119.4375, "epoch": 0.8563018464008563, "grad_norm": 0.01557288970798254, "kl": 0.006504257093183696, "learning_rate": 2.8693790149892937e-06, "loss": 0.0003, "reward": 1.25, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 800 }, { "completion_length": 143.125, "epoch": 0.8573722237088574, "grad_norm": 0.429030179977417, "kl": 0.009496652986854315, "learning_rate": 2.8479657387580302e-06, "loss": 0.0004, "reward": 0.96875, "reward_std": 0.48613589257001877, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.34375, "step": 801 }, { "completion_length": 113.8125, "epoch": 0.8584426010168584, "grad_norm": 0.28019556403160095, "kl": 0.010840977309271693, "learning_rate": 2.826552462526767e-06, "loss": 0.0004, "reward": 1.125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 802 }, { "completion_length": 115.3125, "epoch": 0.8595129783248595, "grad_norm": 0.18566791713237762, "kl": 0.01430837792577222, "learning_rate": 2.8051391862955036e-06, "loss": 0.0006, "reward": 1.03125, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 803 }, { "completion_length": 133.4375, "epoch": 0.8605833556328606, "grad_norm": 0.2564532458782196, "kl": 0.015303282649256289, "learning_rate": 2.78372591006424e-06, "loss": 0.0006, "reward": 0.875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 804 }, { "completion_length": 118.6875, "epoch": 0.8616537329408617, "grad_norm": 0.40447261929512024, "kl": 0.013921299541834742, "learning_rate": 2.7623126338329766e-06, "loss": 0.0006, "reward": 1.03125, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.46875, "step": 805 }, { "completion_length": 138.0, "epoch": 0.8627241102488628, "grad_norm": 0.16943462193012238, "kl": 0.009405800490640104, "learning_rate": 2.7408993576017136e-06, "loss": 0.0004, "reward": 1.3125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.5, "step": 806 }, { "completion_length": 131.125, "epoch": 0.8637944875568638, "grad_norm": 0.012150823138654232, "kl": 0.004822347196750343, "learning_rate": 2.71948608137045e-06, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 807 }, { "completion_length": 126.75, "epoch": 0.8648648648648649, "grad_norm": 0.2634660303592682, "kl": 0.007000271463766694, "learning_rate": 2.6980728051391865e-06, "loss": 0.0003, "reward": 1.28125, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 808 }, { "completion_length": 121.4375, "epoch": 0.8659352421728659, "grad_norm": 0.27076804637908936, "kl": 0.007159431930631399, "learning_rate": 2.6766595289079235e-06, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.46875, "step": 809 }, { "completion_length": 125.5, "epoch": 0.867005619480867, "grad_norm": 0.36343979835510254, "kl": 0.011766646755859256, "learning_rate": 2.65524625267666e-06, "loss": 0.0005, "reward": 1.125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 810 }, { "completion_length": 135.0, "epoch": 0.868075996788868, "grad_norm": 0.44430816173553467, "kl": 0.012201881152577698, "learning_rate": 2.6338329764453965e-06, "loss": 0.0005, "reward": 1.0625, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 811 }, { "completion_length": 139.4375, "epoch": 0.8691463740968691, "grad_norm": 0.1598687320947647, "kl": 0.0068968687555752695, "learning_rate": 2.6124197002141325e-06, "loss": 0.0003, "reward": 1.0, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 812 }, { "completion_length": 127.75, "epoch": 0.8702167514048702, "grad_norm": 0.10236845165491104, "kl": 0.008159077900927514, "learning_rate": 2.59100642398287e-06, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.5, "step": 813 }, { "completion_length": 126.0625, "epoch": 0.8712871287128713, "grad_norm": 0.21709392964839935, "kl": 0.0137818674556911, "learning_rate": 2.5695931477516064e-06, "loss": 0.0006, "reward": 1.25, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 814 }, { "completion_length": 121.4375, "epoch": 0.8723575060208724, "grad_norm": 0.28949427604675293, "kl": 0.006483710138127208, "learning_rate": 2.5481798715203425e-06, "loss": 0.0003, "reward": 1.15625, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 815 }, { "completion_length": 142.25, "epoch": 0.8734278833288734, "grad_norm": 0.2924516201019287, "kl": 0.007825996144674718, "learning_rate": 2.52676659528908e-06, "loss": 0.0003, "reward": 1.125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 816 }, { "completion_length": 125.75, "epoch": 0.8744982606368745, "grad_norm": 0.26642677187919617, "kl": 0.011669724364764988, "learning_rate": 2.5053533190578163e-06, "loss": 0.0005, "reward": 1.3125, "reward_std": 0.2651650309562683, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.4375, "step": 817 }, { "completion_length": 129.8125, "epoch": 0.8755686379448756, "grad_norm": 0.2429966926574707, "kl": 0.0070472355000674725, "learning_rate": 2.4839400428265524e-06, "loss": 0.0003, "reward": 1.25, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 818 }, { "completion_length": 134.375, "epoch": 0.8766390152528767, "grad_norm": 0.25270265340805054, "kl": 0.007741007721051574, "learning_rate": 2.4625267665952893e-06, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.2651650309562683, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 819 }, { "completion_length": 127.1875, "epoch": 0.8777093925608777, "grad_norm": 0.2912638187408447, "kl": 0.009839336504228413, "learning_rate": 2.441113490364026e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 820 }, { "completion_length": 134.75, "epoch": 0.8787797698688787, "grad_norm": 0.36984801292419434, "kl": 0.010172551847063005, "learning_rate": 2.4197002141327623e-06, "loss": 0.0004, "reward": 1.15625, "reward_std": 0.30935921519994736, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 821 }, { "completion_length": 123.3125, "epoch": 0.8798501471768798, "grad_norm": 0.2561309337615967, "kl": 0.009277017903514206, "learning_rate": 2.398286937901499e-06, "loss": 0.0004, "reward": 1.03125, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.46875, "step": 822 }, { "completion_length": 136.125, "epoch": 0.8809205244848809, "grad_norm": 0.2775847017765045, "kl": 0.00644604314584285, "learning_rate": 2.3768736616702357e-06, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 823 }, { "completion_length": 129.625, "epoch": 0.881990901792882, "grad_norm": 0.35031381249427795, "kl": 0.005350252729840577, "learning_rate": 2.355460385438972e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.6187184229493141, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.3125, "step": 824 }, { "completion_length": 134.875, "epoch": 0.8830612791008831, "grad_norm": 0.39375030994415283, "kl": 0.0112859996734187, "learning_rate": 2.334047109207709e-06, "loss": 0.0005, "reward": 1.0, "reward_std": 0.5303300768136978, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 825 }, { "completion_length": 140.75, "epoch": 0.8841316564088841, "grad_norm": 0.24800661206245422, "kl": 0.009441214380785823, "learning_rate": 2.3126338329764456e-06, "loss": 0.0004, "reward": 1.21875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 826 }, { "completion_length": 123.3125, "epoch": 0.8852020337168852, "grad_norm": 0.12633217871189117, "kl": 0.013467074371874332, "learning_rate": 2.291220556745182e-06, "loss": 0.0005, "reward": 1.21875, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 827 }, { "completion_length": 140.9375, "epoch": 0.8862724110248863, "grad_norm": 0.24753686785697937, "kl": 0.011792593635618687, "learning_rate": 2.2698072805139186e-06, "loss": 0.0005, "reward": 1.25, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 828 }, { "completion_length": 144.1875, "epoch": 0.8873427883328874, "grad_norm": 0.4054880440235138, "kl": 0.007192813092842698, "learning_rate": 2.2483940042826555e-06, "loss": 0.0003, "reward": 0.84375, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.40625, "step": 829 }, { "completion_length": 131.1875, "epoch": 0.8884131656408885, "grad_norm": 0.33562034368515015, "kl": 0.008973661344498396, "learning_rate": 2.226980728051392e-06, "loss": 0.0004, "reward": 0.96875, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 830 }, { "completion_length": 127.6875, "epoch": 0.8894835429488895, "grad_norm": 0.4345308542251587, "kl": 0.006401557242497802, "learning_rate": 2.2055674518201285e-06, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 831 }, { "completion_length": 135.3125, "epoch": 0.8905539202568905, "grad_norm": 0.1333722621202469, "kl": 0.008119619218632579, "learning_rate": 2.1841541755888654e-06, "loss": 0.0003, "reward": 1.3125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.5, "step": 832 }, { "completion_length": 125.0, "epoch": 0.8916242975648916, "grad_norm": 0.23763175308704376, "kl": 0.010645574890077114, "learning_rate": 2.162740899357602e-06, "loss": 0.0004, "reward": 1.21875, "reward_std": 0.39774754643440247, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.40625, "step": 833 }, { "completion_length": 118.9375, "epoch": 0.8926946748728927, "grad_norm": 0.37716400623321533, "kl": 0.009309520362876356, "learning_rate": 2.1413276231263384e-06, "loss": 0.0004, "reward": 1.125, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 834 }, { "completion_length": 133.625, "epoch": 0.8937650521808937, "grad_norm": 0.2157670110464096, "kl": 0.009525050991214812, "learning_rate": 2.119914346895075e-06, "loss": 0.0004, "reward": 0.875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 835 }, { "completion_length": 131.5625, "epoch": 0.8948354294888948, "grad_norm": 0.44609302282333374, "kl": 0.01278557552723214, "learning_rate": 2.098501070663812e-06, "loss": 0.0005, "reward": 1.3125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.5, "step": 836 }, { "completion_length": 144.625, "epoch": 0.8959058067968959, "grad_norm": 0.2067864090204239, "kl": 0.007371830171905458, "learning_rate": 2.0770877944325484e-06, "loss": 0.0003, "reward": 1.09375, "reward_std": 0.22097086906433105, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 837 }, { "completion_length": 123.25, "epoch": 0.896976184104897, "grad_norm": 0.16755704581737518, "kl": 0.008541637100279331, "learning_rate": 2.055674518201285e-06, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 838 }, { "completion_length": 138.375, "epoch": 0.8980465614128981, "grad_norm": 0.3552549481391907, "kl": 0.011229442432522774, "learning_rate": 2.0342612419700218e-06, "loss": 0.0004, "reward": 0.90625, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.40625, "step": 839 }, { "completion_length": 134.4375, "epoch": 0.8991169387208992, "grad_norm": 0.014936476945877075, "kl": 0.009393009473569691, "learning_rate": 2.0128479657387583e-06, "loss": 0.0004, "reward": 0.9375, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.4375, "step": 840 }, { "completion_length": 137.0, "epoch": 0.9001873160289002, "grad_norm": 0.2491334229707718, "kl": 0.008390806731767952, "learning_rate": 1.9914346895074948e-06, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 841 }, { "completion_length": 131.9375, "epoch": 0.9012576933369013, "grad_norm": 0.42609289288520813, "kl": 0.011231291224248707, "learning_rate": 1.9700214132762313e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 842 }, { "completion_length": 130.25, "epoch": 0.9023280706449023, "grad_norm": 0.1702113300561905, "kl": 0.006513274391181767, "learning_rate": 1.948608137044968e-06, "loss": 0.0003, "reward": 1.15625, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.40625, "step": 843 }, { "completion_length": 139.5, "epoch": 0.9033984479529034, "grad_norm": 0.18868738412857056, "kl": 0.00926680420525372, "learning_rate": 1.9271948608137047e-06, "loss": 0.0004, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 844 }, { "completion_length": 116.0, "epoch": 0.9044688252609044, "grad_norm": 0.3810214698314667, "kl": 0.008325314964167774, "learning_rate": 1.9057815845824412e-06, "loss": 0.0003, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 845 }, { "completion_length": 135.3125, "epoch": 0.9055392025689055, "grad_norm": 0.3315190374851227, "kl": 0.007074553752318025, "learning_rate": 1.8843683083511779e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.375, "step": 846 }, { "completion_length": 113.25, "epoch": 0.9066095798769066, "grad_norm": 0.2408740520477295, "kl": 0.010330608347430825, "learning_rate": 1.8629550321199144e-06, "loss": 0.0004, "reward": 1.25, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 847 }, { "completion_length": 137.8125, "epoch": 0.9076799571849077, "grad_norm": 0.3844432234764099, "kl": 0.008890569326467812, "learning_rate": 1.841541755888651e-06, "loss": 0.0004, "reward": 0.875, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.4375, "step": 848 }, { "completion_length": 138.3125, "epoch": 0.9087503344929088, "grad_norm": 0.27258604764938354, "kl": 0.012097729369997978, "learning_rate": 1.8201284796573876e-06, "loss": 0.0005, "reward": 0.96875, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 849 }, { "completion_length": 135.4375, "epoch": 0.9098207118009098, "grad_norm": 0.499746710062027, "kl": 0.008218799135647714, "learning_rate": 1.7987152034261243e-06, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.4419417344033718, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 850 }, { "completion_length": 133.125, "epoch": 0.9108910891089109, "grad_norm": 0.20553438365459442, "kl": 0.011194521095603704, "learning_rate": 1.777301927194861e-06, "loss": 0.0004, "reward": 1.09375, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 851 }, { "completion_length": 123.75, "epoch": 0.911961466416912, "grad_norm": 0.15554460883140564, "kl": 0.016580003080889583, "learning_rate": 1.7558886509635975e-06, "loss": 0.0007, "reward": 1.3125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.5, "step": 852 }, { "completion_length": 137.25, "epoch": 0.9130318437249131, "grad_norm": 0.3284795880317688, "kl": 0.010739043122157454, "learning_rate": 1.7344753747323342e-06, "loss": 0.0004, "reward": 1.125, "reward_std": 0.3535533882677555, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 853 }, { "completion_length": 131.625, "epoch": 0.914102221032914, "grad_norm": 0.306974858045578, "kl": 0.014209156390279531, "learning_rate": 1.7130620985010707e-06, "loss": 0.0006, "reward": 0.625, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.3125, "step": 854 }, { "completion_length": 140.25, "epoch": 0.9151725983409151, "grad_norm": 0.3392552137374878, "kl": 0.006655365345068276, "learning_rate": 1.6916488222698074e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.40625, "step": 855 }, { "completion_length": 138.5625, "epoch": 0.9162429756489162, "grad_norm": 0.39303335547447205, "kl": 0.007140449131838977, "learning_rate": 1.6702355460385441e-06, "loss": 0.0003, "reward": 1.0, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 856 }, { "completion_length": 141.6875, "epoch": 0.9173133529569173, "grad_norm": 0.406084269285202, "kl": 0.008715409378055483, "learning_rate": 1.6488222698072806e-06, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.48613590374588966, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 857 }, { "completion_length": 133.25, "epoch": 0.9183837302649184, "grad_norm": 0.42976242303848267, "kl": 0.02261003595776856, "learning_rate": 1.6274089935760173e-06, "loss": 0.0009, "reward": 1.15625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.46875, "step": 858 }, { "completion_length": 134.4375, "epoch": 0.9194541075729195, "grad_norm": 0.1970226913690567, "kl": 0.007975909684319049, "learning_rate": 1.6059957173447538e-06, "loss": 0.0003, "reward": 1.3125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.5, "step": 859 }, { "completion_length": 123.6875, "epoch": 0.9205244848809205, "grad_norm": 0.1540532112121582, "kl": 0.009718539658933878, "learning_rate": 1.5845824411134905e-06, "loss": 0.0004, "reward": 1.21875, "reward_std": 0.04419417306780815, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 860 }, { "completion_length": 134.3125, "epoch": 0.9215948621889216, "grad_norm": 0.3236163854598999, "kl": 0.009357965551316738, "learning_rate": 1.563169164882227e-06, "loss": 0.0004, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 861 }, { "completion_length": 136.0, "epoch": 0.9226652394969227, "grad_norm": 0.38115808367729187, "kl": 0.007043075864203274, "learning_rate": 1.5417558886509637e-06, "loss": 0.0003, "reward": 1.3125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.5, "step": 862 }, { "completion_length": 142.6875, "epoch": 0.9237356168049238, "grad_norm": 0.44823506474494934, "kl": 0.005673856125213206, "learning_rate": 1.5203426124197005e-06, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.34375, "step": 863 }, { "completion_length": 118.6875, "epoch": 0.9248059941129249, "grad_norm": 0.1748836636543274, "kl": 0.006746939034201205, "learning_rate": 1.498929336188437e-06, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 864 }, { "completion_length": 131.25, "epoch": 0.9258763714209258, "grad_norm": 0.4397515058517456, "kl": 0.011117359041236341, "learning_rate": 1.4775160599571737e-06, "loss": 0.0004, "reward": 0.9375, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.4375, "step": 865 }, { "completion_length": 126.8125, "epoch": 0.9269467487289269, "grad_norm": 0.2283439040184021, "kl": 0.005676885601133108, "learning_rate": 1.4561027837259102e-06, "loss": 0.0002, "reward": 1.25, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.5, "step": 866 }, { "completion_length": 143.5, "epoch": 0.928017126036928, "grad_norm": 0.36612096428871155, "kl": 0.009511335520073771, "learning_rate": 1.4346895074946469e-06, "loss": 0.0004, "reward": 1.09375, "reward_std": 0.39774756133556366, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 867 }, { "completion_length": 137.0625, "epoch": 0.9290875033449291, "grad_norm": 0.3578431308269501, "kl": 0.007443786482326686, "learning_rate": 1.4132762312633836e-06, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.4375, "step": 868 }, { "completion_length": 142.875, "epoch": 0.9301578806529301, "grad_norm": 0.39885127544403076, "kl": 0.00724770815577358, "learning_rate": 1.39186295503212e-06, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 869 }, { "completion_length": 142.75, "epoch": 0.9312282579609312, "grad_norm": 0.2964957356452942, "kl": 0.008153350790962577, "learning_rate": 1.3704496788008568e-06, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 870 }, { "completion_length": 124.875, "epoch": 0.9322986352689323, "grad_norm": 0.35611847043037415, "kl": 0.009779923013411462, "learning_rate": 1.3490364025695933e-06, "loss": 0.0004, "reward": 1.0625, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 871 }, { "completion_length": 135.9375, "epoch": 0.9333690125769334, "grad_norm": 0.3712056577205658, "kl": 0.012457122909836471, "learning_rate": 1.32762312633833e-06, "loss": 0.0005, "reward": 1.03125, "reward_std": 0.39774756133556366, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 872 }, { "completion_length": 127.75, "epoch": 0.9344393898849345, "grad_norm": 0.378439337015152, "kl": 0.008547072182409465, "learning_rate": 1.3062098501070663e-06, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 873 }, { "completion_length": 134.6875, "epoch": 0.9355097671929355, "grad_norm": 0.25466203689575195, "kl": 0.009291433147154748, "learning_rate": 1.2847965738758032e-06, "loss": 0.0004, "reward": 1.0, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 874 }, { "completion_length": 121.5, "epoch": 0.9365801445009366, "grad_norm": 0.4017687439918518, "kl": 0.007305440027266741, "learning_rate": 1.26338329764454e-06, "loss": 0.0003, "reward": 1.0, "reward_std": 0.2651650309562683, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 875 }, { "completion_length": 126.0, "epoch": 0.9376505218089376, "grad_norm": 0.3280426263809204, "kl": 0.007912043947726488, "learning_rate": 1.2419700214132762e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.39774756133556366, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.40625, "step": 876 }, { "completion_length": 131.625, "epoch": 0.9387208991169387, "grad_norm": 0.42736512422561646, "kl": 0.012609689962118864, "learning_rate": 1.220556745182013e-06, "loss": 0.0005, "reward": 0.96875, "reward_std": 0.39774755761027336, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 877 }, { "completion_length": 132.0625, "epoch": 0.9397912764249398, "grad_norm": 0.2932626008987427, "kl": 0.010477858711965382, "learning_rate": 1.1991434689507496e-06, "loss": 0.0004, "reward": 1.0625, "reward_std": 0.3535533808171749, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.375, "step": 878 }, { "completion_length": 128.1875, "epoch": 0.9408616537329408, "grad_norm": 0.2469029426574707, "kl": 0.00843798543792218, "learning_rate": 1.177730192719486e-06, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.3125, "rewards/format_reward_func_qa": 0.34375, "step": 879 }, { "completion_length": 125.6875, "epoch": 0.9419320310409419, "grad_norm": 0.3960351347923279, "kl": 0.005910314852371812, "learning_rate": 1.1563169164882228e-06, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 880 }, { "completion_length": 147.9375, "epoch": 0.943002408348943, "grad_norm": 0.3404659330844879, "kl": 0.00636070198379457, "learning_rate": 1.1349036402569593e-06, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.39774756133556366, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.46875, "step": 881 }, { "completion_length": 132.6875, "epoch": 0.9440727856569441, "grad_norm": 0.2074422687292099, "kl": 0.01003689062781632, "learning_rate": 1.113490364025696e-06, "loss": 0.0004, "reward": 1.28125, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 882 }, { "completion_length": 135.375, "epoch": 0.9451431629649452, "grad_norm": 0.3034646809101105, "kl": 0.011570366565138102, "learning_rate": 1.0920770877944327e-06, "loss": 0.0005, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 883 }, { "completion_length": 139.25, "epoch": 0.9462135402729462, "grad_norm": 0.29440754652023315, "kl": 0.008864334551617503, "learning_rate": 1.0706638115631692e-06, "loss": 0.0004, "reward": 1.0625, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.5, "step": 884 }, { "completion_length": 146.5, "epoch": 0.9472839175809473, "grad_norm": 0.3923928141593933, "kl": 0.009240275248885155, "learning_rate": 1.049250535331906e-06, "loss": 0.0004, "reward": 0.90625, "reward_std": 0.39774755015969276, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 885 }, { "completion_length": 128.75, "epoch": 0.9483542948889484, "grad_norm": 0.256527841091156, "kl": 0.008802727796137333, "learning_rate": 1.0278372591006424e-06, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.40625, "step": 886 }, { "completion_length": 130.3125, "epoch": 0.9494246721969494, "grad_norm": 0.43096697330474854, "kl": 0.009572853799909353, "learning_rate": 1.0064239828693791e-06, "loss": 0.0004, "reward": 1.28125, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 887 }, { "completion_length": 141.4375, "epoch": 0.9504950495049505, "grad_norm": 0.5193402767181396, "kl": 0.007065376383252442, "learning_rate": 9.850107066381156e-07, "loss": 0.0003, "reward": 1.0, "reward_std": 0.5303300805389881, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 888 }, { "completion_length": 150.0, "epoch": 0.9515654268129515, "grad_norm": 0.2323281466960907, "kl": 0.0101603310322389, "learning_rate": 9.635974304068523e-07, "loss": 0.0004, "reward": 0.78125, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.34375, "step": 889 }, { "completion_length": 136.6875, "epoch": 0.9526358041209526, "grad_norm": 0.43325403332710266, "kl": 0.008027750882320106, "learning_rate": 9.421841541755889e-07, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.5, "step": 890 }, { "completion_length": 144.4375, "epoch": 0.9537061814289537, "grad_norm": 0.41024377942085266, "kl": 0.014538660296238959, "learning_rate": 9.207708779443255e-07, "loss": 0.0006, "reward": 0.96875, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.34375, "step": 891 }, { "completion_length": 145.375, "epoch": 0.9547765587369548, "grad_norm": 0.3636007010936737, "kl": 0.011141704628244042, "learning_rate": 8.993576017130621e-07, "loss": 0.0004, "reward": 1.03125, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 892 }, { "completion_length": 128.8125, "epoch": 0.9558469360449559, "grad_norm": 0.40895262360572815, "kl": 0.011309466208331287, "learning_rate": 8.779443254817988e-07, "loss": 0.0005, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 893 }, { "completion_length": 124.0, "epoch": 0.9569173133529569, "grad_norm": 0.14040705561637878, "kl": 0.014053698279894888, "learning_rate": 8.565310492505354e-07, "loss": 0.0006, "reward": 1.125, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 894 }, { "completion_length": 139.6875, "epoch": 0.957987690660958, "grad_norm": 0.29805439710617065, "kl": 0.014652458485215902, "learning_rate": 8.351177730192721e-07, "loss": 0.0006, "reward": 1.125, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.5, "step": 895 }, { "completion_length": 112.8125, "epoch": 0.9590580679689591, "grad_norm": 0.2931126058101654, "kl": 0.032070288667455316, "learning_rate": 8.137044967880087e-07, "loss": 0.0013, "reward": 1.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 896 }, { "completion_length": 116.6875, "epoch": 0.9601284452769602, "grad_norm": 0.419941246509552, "kl": 0.010610463563352823, "learning_rate": 7.922912205567453e-07, "loss": 0.0004, "reward": 1.0625, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 897 }, { "completion_length": 150.0, "epoch": 0.9611988225849611, "grad_norm": 0.3349963426589966, "kl": 0.01056701224297285, "learning_rate": 7.708779443254819e-07, "loss": 0.0004, "reward": 0.96875, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 898 }, { "completion_length": 143.9375, "epoch": 0.9622691998929622, "grad_norm": 0.3059155344963074, "kl": 0.005418091081082821, "learning_rate": 7.494646680942185e-07, "loss": 0.0002, "reward": 0.9375, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 899 }, { "completion_length": 142.5625, "epoch": 0.9633395772009633, "grad_norm": 0.310170978307724, "kl": 0.0053399253520183265, "learning_rate": 7.280513918629551e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.3125, "step": 900 }, { "completion_length": 139.6875, "epoch": 0.9644099545089644, "grad_norm": 0.27191832661628723, "kl": 0.008772899280302227, "learning_rate": 7.066381156316918e-07, "loss": 0.0004, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 901 }, { "completion_length": 115.4375, "epoch": 0.9654803318169655, "grad_norm": 0.3773668706417084, "kl": 0.008046950912103057, "learning_rate": 6.852248394004284e-07, "loss": 0.0003, "reward": 1.125, "reward_std": 0.2651650384068489, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.4375, "step": 902 }, { "completion_length": 124.4375, "epoch": 0.9665507091249665, "grad_norm": 0.2622586488723755, "kl": 0.01882827968802303, "learning_rate": 6.63811563169165e-07, "loss": 0.0008, "reward": 1.15625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.40625, "step": 903 }, { "completion_length": 131.4375, "epoch": 0.9676210864329676, "grad_norm": 0.3436194658279419, "kl": 0.008304660208523273, "learning_rate": 6.423982869379016e-07, "loss": 0.0003, "reward": 1.09375, "reward_std": 0.22097086533904076, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.46875, "step": 904 }, { "completion_length": 149.0, "epoch": 0.9686914637409687, "grad_norm": 0.35091543197631836, "kl": 0.007805566652677953, "learning_rate": 6.209850107066381e-07, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.4419417306780815, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 905 }, { "completion_length": 128.4375, "epoch": 0.9697618410489698, "grad_norm": 0.020937882363796234, "kl": 0.009758950909599662, "learning_rate": 5.995717344753748e-07, "loss": 0.0004, "reward": 1.375, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.5, "step": 906 }, { "completion_length": 128.625, "epoch": 0.9708322183569709, "grad_norm": 0.2938354015350342, "kl": 0.007473053061403334, "learning_rate": 5.781584582441114e-07, "loss": 0.0003, "reward": 1.3125, "reward_std": 0.2651650421321392, "rewards/correctness_reward_func_qa": 0.875, "rewards/format_reward_func_qa": 0.4375, "step": 907 }, { "completion_length": 124.125, "epoch": 0.971902595664972, "grad_norm": 0.2342502474784851, "kl": 0.016936437925323844, "learning_rate": 5.56745182012848e-07, "loss": 0.0007, "reward": 1.09375, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 908 }, { "completion_length": 135.125, "epoch": 0.972972972972973, "grad_norm": 0.22863703966140747, "kl": 0.005928155849687755, "learning_rate": 5.353319057815846e-07, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.25, "step": 909 }, { "completion_length": 137.4375, "epoch": 0.974043350280974, "grad_norm": 0.32250818610191345, "kl": 0.01202561764512211, "learning_rate": 5.139186295503212e-07, "loss": 0.0005, "reward": 1.03125, "reward_std": 0.22097086161375046, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.34375, "step": 910 }, { "completion_length": 131.9375, "epoch": 0.9751137275889751, "grad_norm": 0.4060610830783844, "kl": 0.015742171090096235, "learning_rate": 4.925053533190578e-07, "loss": 0.0006, "reward": 0.71875, "reward_std": 0.39774755388498306, "rewards/correctness_reward_func_qa": 0.375, "rewards/format_reward_func_qa": 0.34375, "step": 911 }, { "completion_length": 116.25, "epoch": 0.9761841048969762, "grad_norm": 0.5060104727745056, "kl": 0.008879208355210721, "learning_rate": 4.7109207708779447e-07, "loss": 0.0004, "reward": 0.96875, "reward_std": 0.39774755761027336, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.40625, "step": 912 }, { "completion_length": 123.625, "epoch": 0.9772544822049772, "grad_norm": 0.292103111743927, "kl": 0.010586361167952418, "learning_rate": 4.496788008565311e-07, "loss": 0.0004, "reward": 1.0, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.5, "step": 913 }, { "completion_length": 141.1875, "epoch": 0.9783248595129783, "grad_norm": 0.35570940375328064, "kl": 0.010702368919737637, "learning_rate": 4.282655246252677e-07, "loss": 0.0004, "reward": 0.90625, "reward_std": 0.30935920774936676, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 914 }, { "completion_length": 140.5, "epoch": 0.9793952368209794, "grad_norm": 0.2861424684524536, "kl": 0.006700944853946567, "learning_rate": 4.0685224839400433e-07, "loss": 0.0003, "reward": 0.875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.375, "step": 915 }, { "completion_length": 127.8125, "epoch": 0.9804656141289805, "grad_norm": 0.22765056788921356, "kl": 0.007059081457555294, "learning_rate": 3.8543897216274094e-07, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.4375, "step": 916 }, { "completion_length": 125.125, "epoch": 0.9815359914369816, "grad_norm": 0.3078356981277466, "kl": 0.007385566714219749, "learning_rate": 3.6402569593147754e-07, "loss": 0.0003, "reward": 1.28125, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 917 }, { "completion_length": 141.3125, "epoch": 0.9826063687449826, "grad_norm": 0.22258876264095306, "kl": 0.00844906794372946, "learning_rate": 3.426124197002142e-07, "loss": 0.0003, "reward": 1.0, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 918 }, { "completion_length": 143.6875, "epoch": 0.9836767460529837, "grad_norm": 0.41988077759742737, "kl": 0.015501756453886628, "learning_rate": 3.211991434689508e-07, "loss": 0.0006, "reward": 1.03125, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 919 }, { "completion_length": 141.625, "epoch": 0.9847471233609848, "grad_norm": 0.2571990191936493, "kl": 0.007820777944289148, "learning_rate": 2.997858672376874e-07, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 920 }, { "completion_length": 129.5, "epoch": 0.9858175006689858, "grad_norm": 0.23291267454624176, "kl": 0.008339556341525167, "learning_rate": 2.78372591006424e-07, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.34375, "step": 921 }, { "completion_length": 128.6875, "epoch": 0.9868878779769868, "grad_norm": 0.130796417593956, "kl": 0.005826088017784059, "learning_rate": 2.569593147751606e-07, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.13258251547813416, "rewards/correctness_reward_func_qa": 0.6875, "rewards/format_reward_func_qa": 0.40625, "step": 922 }, { "completion_length": 147.125, "epoch": 0.9879582552849879, "grad_norm": 0.01319180428981781, "kl": 0.008099900209344923, "learning_rate": 2.3554603854389724e-07, "loss": 0.0003, "reward": 1.0, "reward_std": 0.0, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.375, "step": 923 }, { "completion_length": 148.625, "epoch": 0.989028632592989, "grad_norm": 0.4370482265949249, "kl": 0.008805748075246811, "learning_rate": 2.1413276231263384e-07, "loss": 0.0004, "reward": 1.03125, "reward_std": 0.39774756133556366, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.40625, "step": 924 }, { "completion_length": 118.1875, "epoch": 0.9900990099009901, "grad_norm": 0.2641826272010803, "kl": 0.007795166107825935, "learning_rate": 1.9271948608137047e-07, "loss": 0.0003, "reward": 1.28125, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 925 }, { "completion_length": 142.4375, "epoch": 0.9911693872089912, "grad_norm": 0.12282008677721024, "kl": 0.008558577275834978, "learning_rate": 1.713062098501071e-07, "loss": 0.0003, "reward": 1.25, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.4375, "step": 926 }, { "completion_length": 113.5, "epoch": 0.9922397645169923, "grad_norm": 0.30356624722480774, "kl": 0.007500700594391674, "learning_rate": 1.498929336188437e-07, "loss": 0.0003, "reward": 0.75, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func_qa": 0.4375, "rewards/format_reward_func_qa": 0.3125, "step": 927 }, { "completion_length": 146.375, "epoch": 0.9933101418249933, "grad_norm": 0.5263656973838806, "kl": 0.008850720943883061, "learning_rate": 1.284796573875803e-07, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.30935921147465706, "rewards/correctness_reward_func_qa": 0.5, "rewards/format_reward_func_qa": 0.34375, "step": 928 }, { "completion_length": 129.1875, "epoch": 0.9943805191329944, "grad_norm": 0.37286561727523804, "kl": 0.011664152727462351, "learning_rate": 1.0706638115631692e-07, "loss": 0.0005, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.4375, "step": 929 }, { "completion_length": 139.1875, "epoch": 0.9954508964409955, "grad_norm": 0.2327730804681778, "kl": 0.011496033694129437, "learning_rate": 8.565310492505355e-08, "loss": 0.0005, "reward": 1.28125, "reward_std": 0.13258251920342445, "rewards/correctness_reward_func_qa": 0.8125, "rewards/format_reward_func_qa": 0.46875, "step": 930 }, { "completion_length": 129.25, "epoch": 0.9965212737489966, "grad_norm": 0.32198476791381836, "kl": 0.02556618954986334, "learning_rate": 6.423982869379015e-08, "loss": 0.001, "reward": 1.15625, "reward_std": 0.48613589257001877, "rewards/correctness_reward_func_qa": 0.75, "rewards/format_reward_func_qa": 0.40625, "step": 931 }, { "completion_length": 134.3125, "epoch": 0.9975916510569975, "grad_norm": 0.36084499955177307, "kl": 0.007645938079804182, "learning_rate": 4.2826552462526774e-08, "loss": 0.0003, "reward": 1.0, "reward_std": 0.2651650346815586, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.4375, "step": 932 }, { "completion_length": 126.3125, "epoch": 0.9986620283649986, "grad_norm": 0.31160324811935425, "kl": 0.008126129629090428, "learning_rate": 2.1413276231263387e-08, "loss": 0.0003, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/correctness_reward_func_qa": 0.625, "rewards/format_reward_func_qa": 0.5, "step": 933 }, { "completion_length": 141.625, "epoch": 0.9997324056729997, "grad_norm": 0.32341817021369934, "kl": 0.011580780032090843, "learning_rate": 0.0, "loss": 0.0005, "reward": 0.9375, "reward_std": 0.1767766885459423, "rewards/correctness_reward_func_qa": 0.5625, "rewards/format_reward_func_qa": 0.375, "step": 934 } ], "logging_steps": 1, "max_steps": 934, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }