{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.005574979492039726, "eval_steps": 500, "global_step": 700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 739.7375, "epoch": 7.964256417199609e-05, "grad_norm": 0.17606443166732788, "kl": 0.0006033612473402173, "learning_rate": 6.25e-07, "loss": 0.0, "reward": -3.3789249688386915, "reward_std": 1.234160715341568, "rewards/custom_reward_logic_v2": -3.3789249688386915, "step": 10 }, { "completion_length": 881.3875, "epoch": 0.00015928512834399218, "grad_norm": 0.1873437762260437, "kl": 0.0007326043589273468, "learning_rate": 1.25e-06, "loss": 0.0, "reward": -4.281049972772598, "reward_std": 1.4586432427167892, "rewards/custom_reward_logic_v2": -4.281049972772598, "step": 20 }, { "completion_length": 798.325, "epoch": 0.00023892769251598824, "grad_norm": 0.16695357859134674, "kl": 0.0007708041899604723, "learning_rate": 1.8750000000000003e-06, "loss": 0.0, "reward": -3.79504998922348, "reward_std": 1.4004287779331208, "rewards/custom_reward_logic_v2": -3.79504998922348, "step": 30 }, { "completion_length": 885.76875, "epoch": 0.00031857025668798435, "grad_norm": 0.18757909536361694, "kl": 0.0007291340152733028, "learning_rate": 2.5e-06, "loss": 0.0, "reward": -3.8223875135183336, "reward_std": 1.053759826719761, "rewards/custom_reward_logic_v2": -3.8223875135183336, "step": 40 }, { "completion_length": 806.35625, "epoch": 0.0003982128208599804, "grad_norm": 0.1678084433078766, "kl": 0.000756343750981614, "learning_rate": 3.125e-06, "loss": 0.0, "reward": -3.831325000524521, "reward_std": 1.2611359059810638, "rewards/custom_reward_logic_v2": -3.831325000524521, "step": 50 }, { "completion_length": 860.95, "epoch": 0.0004778553850319765, "grad_norm": 0.16106949746608734, "kl": 0.0007545762317022308, "learning_rate": 3.7500000000000005e-06, "loss": 0.0, "reward": -3.99547501206398, "reward_std": 1.233138319849968, "rewards/custom_reward_logic_v2": -3.99547501206398, "step": 60 }, { "completion_length": 831.175, "epoch": 0.0005574979492039726, "grad_norm": 0.1723652333021164, "kl": 0.0007971685263328254, "learning_rate": 4.3750000000000005e-06, "loss": 0.0, "reward": -4.036549943685531, "reward_std": 1.5394920334219933, "rewards/custom_reward_logic_v2": -4.036549943685531, "step": 70 }, { "completion_length": 874.325, "epoch": 0.0006371405133759687, "grad_norm": 0.2079666703939438, "kl": 0.0008876581850927323, "learning_rate": 5e-06, "loss": 0.0, "reward": -3.92242501154542, "reward_std": 1.2604085817933082, "rewards/custom_reward_logic_v2": -3.92242501154542, "step": 80 }, { "completion_length": 791.91875, "epoch": 0.0007167830775479647, "grad_norm": 0.16253575682640076, "kl": 0.0010255174711346626, "learning_rate": 4.997620553954645e-06, "loss": 0.0, "reward": -3.364587500691414, "reward_std": 1.2228698313236237, "rewards/custom_reward_logic_v2": -3.364587500691414, "step": 90 }, { "completion_length": 846.2875, "epoch": 0.0007964256417199608, "grad_norm": 0.18019770085811615, "kl": 0.0013353260728763416, "learning_rate": 4.990486745229364e-06, "loss": 0.0001, "reward": -3.805912530422211, "reward_std": 1.2458222389221192, "rewards/custom_reward_logic_v2": -3.805912530422211, "step": 100 }, { "completion_length": 729.04375, "epoch": 0.0008760682058919569, "grad_norm": 0.1617293506860733, "kl": 0.0018967354553751647, "learning_rate": 4.978612153434527e-06, "loss": 0.0001, "reward": -3.071175017207861, "reward_std": 1.3524149775505065, "rewards/custom_reward_logic_v2": -3.071175017207861, "step": 110 }, { "completion_length": 641.74375, "epoch": 0.000955710770063953, "grad_norm": 0.26608461141586304, "kl": 0.0029119997401721776, "learning_rate": 4.962019382530521e-06, "loss": 0.0001, "reward": -2.690687493979931, "reward_std": 1.0754198133945465, "rewards/custom_reward_logic_v2": -2.690687493979931, "step": 120 }, { "completion_length": 883.9875, "epoch": 0.001035353334235949, "grad_norm": 0.7612231373786926, "kl": 0.003597881377208978, "learning_rate": 4.9407400177998335e-06, "loss": 0.0001, "reward": -3.8035999715328215, "reward_std": 1.2502110481262207, "rewards/custom_reward_logic_v2": -3.8035999715328215, "step": 130 }, { "completion_length": 524.48125, "epoch": 0.0011149958984079452, "grad_norm": 0.9543402791023254, "kl": 0.08978197913384064, "learning_rate": 4.914814565722671e-06, "loss": 0.0036, "reward": -2.0596874909475447, "reward_std": 1.3678732179105282, "rewards/custom_reward_logic_v2": -2.0596874909475447, "step": 140 }, { "completion_length": 46.43125, "epoch": 0.0011946384625799412, "grad_norm": 0.7850804328918457, "kl": 0.3018287725746632, "learning_rate": 4.884292376870567e-06, "loss": 0.0121, "reward": -0.04024999849498272, "reward_std": 0.430637900531292, "rewards/custom_reward_logic_v2": -0.04024999849498272, "step": 150 }, { "completion_length": 18.93125, "epoch": 0.0012742810267519374, "grad_norm": 0.032512303441762924, "kl": 0.33459745422005654, "learning_rate": 4.849231551964771e-06, "loss": 0.0134, "reward": 0.1650000035762787, "reward_std": 0.07605109438300132, "rewards/custom_reward_logic_v2": 0.1650000035762787, "step": 160 }, { "completion_length": 17.4375, "epoch": 0.0013539235909239334, "grad_norm": 0.02004638873040676, "kl": 0.35064528286457064, "learning_rate": 4.809698831278217e-06, "loss": 0.014, "reward": 0.08999999985098839, "reward_std": 0.125558003783226, "rewards/custom_reward_logic_v2": 0.08999999985098839, "step": 170 }, { "completion_length": 23.975, "epoch": 0.0014335661550959294, "grad_norm": 0.2281995564699173, "kl": 0.3118164837360382, "learning_rate": 4.765769467591626e-06, "loss": 0.0125, "reward": 0.08099999986588954, "reward_std": 0.18301311507821083, "rewards/custom_reward_logic_v2": 0.08099999986588954, "step": 180 }, { "completion_length": 17.7, "epoch": 0.0015132087192679256, "grad_norm": 0.20832708477973938, "kl": 0.34881954491138456, "learning_rate": 4.717527082945555e-06, "loss": 0.014, "reward": 0.14687500111758708, "reward_std": 0.13193419948220253, "rewards/custom_reward_logic_v2": 0.14687500111758708, "step": 190 }, { "completion_length": 28.76875, "epoch": 0.0015928512834399217, "grad_norm": 0.2148224264383316, "kl": 0.4086977861821651, "learning_rate": 4.665063509461098e-06, "loss": 0.0163, "reward": 0.06411250084638595, "reward_std": 0.09681975245475768, "rewards/custom_reward_logic_v2": 0.06411250084638595, "step": 200 }, { "completion_length": 19.3, "epoch": 0.0016724938476119177, "grad_norm": 0.03454764187335968, "kl": 0.3337583176791668, "learning_rate": 4.608478614532215e-06, "loss": 0.0134, "reward": 0.21312500052154065, "reward_std": 0.1542310357093811, "rewards/custom_reward_logic_v2": 0.21312500052154065, "step": 210 }, { "completion_length": 48.96875, "epoch": 0.0017521364117839139, "grad_norm": 0.8877259492874146, "kl": 0.3230514988303185, "learning_rate": 4.54788011072248e-06, "loss": 0.0129, "reward": -0.12147499993443489, "reward_std": 0.4157312333583832, "rewards/custom_reward_logic_v2": -0.12147499993443489, "step": 220 }, { "completion_length": 19.41875, "epoch": 0.0018317789759559099, "grad_norm": 0.7465932369232178, "kl": 0.32680382803082464, "learning_rate": 4.4833833507280884e-06, "loss": 0.0131, "reward": 0.14000000059604645, "reward_std": 0.09731742069125175, "rewards/custom_reward_logic_v2": 0.14000000059604645, "step": 230 }, { "completion_length": 23.425, "epoch": 0.001911421540127906, "grad_norm": 0.4111487567424774, "kl": 0.3509559452533722, "learning_rate": 4.415111107797445e-06, "loss": 0.014, "reward": 0.18286250159144402, "reward_std": 0.1811980500817299, "rewards/custom_reward_logic_v2": 0.18286250159144402, "step": 240 }, { "completion_length": 18.91875, "epoch": 0.001991064104299902, "grad_norm": 0.8882763385772705, "kl": 0.3525215476751328, "learning_rate": 4.34319334202531e-06, "loss": 0.0141, "reward": 0.17062499970197678, "reward_std": 0.11504097878932953, "rewards/custom_reward_logic_v2": 0.17062499970197678, "step": 250 }, { "completion_length": 20.475, "epoch": 0.002070706668471898, "grad_norm": 0.03645075112581253, "kl": 0.3291649468243122, "learning_rate": 4.267766952966369e-06, "loss": 0.0132, "reward": 0.16500000059604644, "reward_std": 0.1858065977692604, "rewards/custom_reward_logic_v2": 0.16500000059604644, "step": 260 }, { "completion_length": 24.5625, "epoch": 0.0021503492326438944, "grad_norm": 1.1167131662368774, "kl": 0.33756194859743116, "learning_rate": 4.188975519039151e-06, "loss": 0.0135, "reward": 0.10505000110715627, "reward_std": 0.0828484557569027, "rewards/custom_reward_logic_v2": 0.10505000110715627, "step": 270 }, { "completion_length": 19.925, "epoch": 0.0022299917968158904, "grad_norm": 0.8635123372077942, "kl": 0.32979664355516436, "learning_rate": 4.106969024216348e-06, "loss": 0.0132, "reward": 0.20062500163912772, "reward_std": 0.1258012667298317, "rewards/custom_reward_logic_v2": 0.20062500163912772, "step": 280 }, { "completion_length": 41.35625, "epoch": 0.0023096343609878864, "grad_norm": 0.7731335163116455, "kl": 0.29900490418076514, "learning_rate": 4.021903572521802e-06, "loss": 0.012, "reward": 0.13356250263750552, "reward_std": 0.11855373680591583, "rewards/custom_reward_logic_v2": 0.13356250263750552, "step": 290 }, { "completion_length": 21.425, "epoch": 0.0023892769251598824, "grad_norm": 0.050558220595121384, "kl": 0.30905950888991357, "learning_rate": 3.933941090877615e-06, "loss": 0.0124, "reward": 0.10625000111758709, "reward_std": 0.07851103022694587, "rewards/custom_reward_logic_v2": 0.10625000111758709, "step": 300 }, { "completion_length": 28.39375, "epoch": 0.0024689194893318784, "grad_norm": 1.2737127542495728, "kl": 0.3259002223610878, "learning_rate": 3.8432490208670605e-06, "loss": 0.013, "reward": 0.07012500055134296, "reward_std": 0.21550666987895967, "rewards/custom_reward_logic_v2": 0.07012500055134296, "step": 310 }, { "completion_length": 20.49375, "epoch": 0.002548562053503875, "grad_norm": 1.3667010068893433, "kl": 0.32961594611406325, "learning_rate": 3.7500000000000005e-06, "loss": 0.0132, "reward": 0.15562500059604645, "reward_std": 0.14379026368260384, "rewards/custom_reward_logic_v2": 0.15562500059604645, "step": 320 }, { "completion_length": 23.7625, "epoch": 0.002628204617675871, "grad_norm": 0.9662195444107056, "kl": 0.3291011206805706, "learning_rate": 3.654371533087586e-06, "loss": 0.0132, "reward": 0.20617500003427267, "reward_std": 0.12530190348625184, "rewards/custom_reward_logic_v2": 0.20617500003427267, "step": 330 }, { "completion_length": 19.15, "epoch": 0.002707847181847867, "grad_norm": 2.964785099029541, "kl": 0.3629206448793411, "learning_rate": 3.556545654351749e-06, "loss": 0.0145, "reward": 0.10437500067055225, "reward_std": 0.12071752324700355, "rewards/custom_reward_logic_v2": 0.10437500067055225, "step": 340 }, { "completion_length": 20.4875, "epoch": 0.002787489746019863, "grad_norm": 1.0044533014297485, "kl": 0.3254102662205696, "learning_rate": 3.4567085809127247e-06, "loss": 0.013, "reward": 0.15562499798834323, "reward_std": 0.15355074554681777, "rewards/custom_reward_logic_v2": 0.15562499798834323, "step": 350 }, { "completion_length": 21.21875, "epoch": 0.002867132310191859, "grad_norm": 0.8673160672187805, "kl": 0.328788036108017, "learning_rate": 3.3550503583141726e-06, "loss": 0.0132, "reward": 0.22808750197291375, "reward_std": 0.14038661643862724, "rewards/custom_reward_logic_v2": 0.22808750197291375, "step": 360 }, { "completion_length": 18.29375, "epoch": 0.002946774874363855, "grad_norm": 1.275578260421753, "kl": 0.3586613781750202, "learning_rate": 3.2517644987606827e-06, "loss": 0.0143, "reward": 0.09437500052154064, "reward_std": 0.13283729180693626, "rewards/custom_reward_logic_v2": 0.09437500052154064, "step": 370 }, { "completion_length": 19.625, "epoch": 0.0030264174385358513, "grad_norm": 1.135249376296997, "kl": 0.3399433046579361, "learning_rate": 3.147047612756302e-06, "loss": 0.0136, "reward": 0.18000000156462193, "reward_std": 0.1102687232196331, "rewards/custom_reward_logic_v2": 0.18000000156462193, "step": 380 }, { "completion_length": 18.65625, "epoch": 0.0031060600027078473, "grad_norm": 0.0214656013995409, "kl": 0.3453727260231972, "learning_rate": 3.0410990348452572e-06, "loss": 0.0138, "reward": 0.14312500059604644, "reward_std": 0.21185824573040007, "rewards/custom_reward_logic_v2": 0.14312500059604644, "step": 390 }, { "completion_length": 22.5875, "epoch": 0.0031857025668798433, "grad_norm": 1.1392817497253418, "kl": 0.3561431519687176, "learning_rate": 2.9341204441673267e-06, "loss": 0.0142, "reward": 0.09312500022351741, "reward_std": 0.09467698186635971, "rewards/custom_reward_logic_v2": 0.09312500022351741, "step": 400 }, { "completion_length": 21.9125, "epoch": 0.0032653451310518393, "grad_norm": 0.12919628620147705, "kl": 0.3515960440039635, "learning_rate": 2.82631548055013e-06, "loss": 0.0141, "reward": 0.07376250103116036, "reward_std": 0.15707473903894426, "rewards/custom_reward_logic_v2": 0.07376250103116036, "step": 410 }, { "completion_length": 20.325, "epoch": 0.0033449876952238353, "grad_norm": 0.08202961087226868, "kl": 0.34852803200483323, "learning_rate": 2.717889356869146e-06, "loss": 0.0139, "reward": 0.2074999999254942, "reward_std": 0.11485048606991768, "rewards/custom_reward_logic_v2": 0.2074999999254942, "step": 420 }, { "completion_length": 19.7375, "epoch": 0.0034246302593958313, "grad_norm": 1.5309367179870605, "kl": 0.339575307816267, "learning_rate": 2.6090484684133406e-06, "loss": 0.0136, "reward": 0.03375000059604645, "reward_std": 0.08364979848265648, "rewards/custom_reward_logic_v2": 0.03375000059604645, "step": 430 }, { "completion_length": 29.5375, "epoch": 0.0035042728235678278, "grad_norm": 0.08438611030578613, "kl": 0.34286700189113617, "learning_rate": 2.5e-06, "loss": 0.0137, "reward": 0.08044999912381172, "reward_std": 0.16319628208875656, "rewards/custom_reward_logic_v2": 0.08044999912381172, "step": 440 }, { "completion_length": 23.2125, "epoch": 0.0035839153877398238, "grad_norm": 0.11283387243747711, "kl": 0.3263735562562943, "learning_rate": 2.3909515315866606e-06, "loss": 0.0131, "reward": 0.11125000230967999, "reward_std": 0.10089804157614708, "rewards/custom_reward_logic_v2": 0.11125000230967999, "step": 450 }, { "completion_length": 20.14375, "epoch": 0.0036635579519118198, "grad_norm": 0.7745999693870544, "kl": 0.3447819516062737, "learning_rate": 2.2821106431308546e-06, "loss": 0.0138, "reward": 0.11187500096857547, "reward_std": 0.12871785834431648, "rewards/custom_reward_logic_v2": 0.11187500096857547, "step": 460 }, { "completion_length": 27.35, "epoch": 0.003743200516083816, "grad_norm": 1.4974488019943237, "kl": 0.3425402037799358, "learning_rate": 2.173684519449872e-06, "loss": 0.0137, "reward": 0.10542500019073486, "reward_std": 0.22862085253000258, "rewards/custom_reward_logic_v2": 0.10542500019073486, "step": 470 }, { "completion_length": 19.40625, "epoch": 0.003822843080255812, "grad_norm": 1.1579034328460693, "kl": 0.3382424309849739, "learning_rate": 2.0658795558326745e-06, "loss": 0.0135, "reward": 0.15562499947845937, "reward_std": 0.1210292175412178, "rewards/custom_reward_logic_v2": 0.15562499947845937, "step": 480 }, { "completion_length": 37.24375, "epoch": 0.003902485644427808, "grad_norm": 0.7052723169326782, "kl": 0.35214473977684974, "learning_rate": 1.958900965154743e-06, "loss": 0.0141, "reward": 0.07051250115036964, "reward_std": 0.20176818892359732, "rewards/custom_reward_logic_v2": 0.07051250115036964, "step": 490 }, { "completion_length": 19.23125, "epoch": 0.003982128208599804, "grad_norm": 0.062097422778606415, "kl": 0.3568013899028301, "learning_rate": 1.852952387243698e-06, "loss": 0.0143, "reward": 0.13374999910593033, "reward_std": 0.10076134353876114, "rewards/custom_reward_logic_v2": 0.13374999910593033, "step": 500 }, { "completion_length": 18.61875, "epoch": 0.0040617707727718, "grad_norm": 1.673584222793579, "kl": 0.3592236742377281, "learning_rate": 1.7482355012393177e-06, "loss": 0.0144, "reward": 0.14000000134110452, "reward_std": 0.09233622029423713, "rewards/custom_reward_logic_v2": 0.14000000134110452, "step": 510 }, { "completion_length": 20.41875, "epoch": 0.004141413336943796, "grad_norm": 1.1183210611343384, "kl": 0.33706687912344935, "learning_rate": 1.6449496416858285e-06, "loss": 0.0135, "reward": 0.10500000081956387, "reward_std": 0.09869231358170509, "rewards/custom_reward_logic_v2": 0.10500000081956387, "step": 520 }, { "completion_length": 20.8125, "epoch": 0.004221055901115793, "grad_norm": 0.4382721185684204, "kl": 0.3610161267220974, "learning_rate": 1.5432914190872757e-06, "loss": 0.0144, "reward": 0.19562500044703485, "reward_std": 0.1188055507838726, "rewards/custom_reward_logic_v2": 0.19562500044703485, "step": 530 }, { "completion_length": 19.46875, "epoch": 0.004300698465287789, "grad_norm": 1.3095043897628784, "kl": 0.34188042730093005, "learning_rate": 1.443454345648252e-06, "loss": 0.0137, "reward": 0.20749999955296516, "reward_std": 0.19307591021060944, "rewards/custom_reward_logic_v2": 0.20749999955296516, "step": 540 }, { "completion_length": 21.63125, "epoch": 0.004380341029459785, "grad_norm": 0.04259713739156723, "kl": 0.3633933149278164, "learning_rate": 1.3456284669124159e-06, "loss": 0.0145, "reward": 0.11875000149011612, "reward_std": 0.1355846919119358, "rewards/custom_reward_logic_v2": 0.11875000149011612, "step": 550 }, { "completion_length": 19.2625, "epoch": 0.004459983593631781, "grad_norm": 0.5744329690933228, "kl": 0.3458960048854351, "learning_rate": 1.2500000000000007e-06, "loss": 0.0138, "reward": 0.3293750025331974, "reward_std": 0.104243653267622, "rewards/custom_reward_logic_v2": 0.3293750025331974, "step": 560 }, { "completion_length": 20.05625, "epoch": 0.004539626157803777, "grad_norm": 0.1549508273601532, "kl": 0.346449576318264, "learning_rate": 1.1567509791329402e-06, "loss": 0.0139, "reward": 0.17625000029802323, "reward_std": 0.1429968483746052, "rewards/custom_reward_logic_v2": 0.17625000029802323, "step": 570 }, { "completion_length": 19.86875, "epoch": 0.004619268721975773, "grad_norm": 0.14351911842823029, "kl": 0.39532790407538415, "learning_rate": 1.0660589091223854e-06, "loss": 0.0158, "reward": 0.17437500022351743, "reward_std": 0.13940104842185974, "rewards/custom_reward_logic_v2": 0.17437500022351743, "step": 580 }, { "completion_length": 19.28125, "epoch": 0.004698911286147769, "grad_norm": 1.1975979804992676, "kl": 0.3690756544470787, "learning_rate": 9.780964274781984e-07, "loss": 0.0148, "reward": 0.20562500059604644, "reward_std": 0.11339747980237007, "rewards/custom_reward_logic_v2": 0.20562500059604644, "step": 590 }, { "completion_length": 18.3125, "epoch": 0.004778553850319765, "grad_norm": 0.03664500266313553, "kl": 0.34169030636548997, "learning_rate": 8.930309757836517e-07, "loss": 0.0137, "reward": 0.12624999806284903, "reward_std": 0.1257291093468666, "rewards/custom_reward_logic_v2": 0.12624999806284903, "step": 600 }, { "completion_length": 20.73125, "epoch": 0.004858196414491761, "grad_norm": 1.0636727809906006, "kl": 0.32965768277645113, "learning_rate": 8.110244809608494e-07, "loss": 0.0132, "reward": 0.1900000013411045, "reward_std": 0.2276224449276924, "rewards/custom_reward_logic_v2": 0.1900000013411045, "step": 610 }, { "completion_length": 21.09375, "epoch": 0.004937838978663757, "grad_norm": 0.6413007378578186, "kl": 0.4171911731362343, "learning_rate": 7.322330470336314e-07, "loss": 0.0167, "reward": 0.06624999977648258, "reward_std": 0.08008950427174569, "rewards/custom_reward_logic_v2": 0.06624999977648258, "step": 620 }, { "completion_length": 19.6125, "epoch": 0.005017481542835753, "grad_norm": 0.4128471612930298, "kl": 0.3929149940609932, "learning_rate": 6.568066579746901e-07, "loss": 0.0157, "reward": 0.1518750011920929, "reward_std": 0.09802244454622269, "rewards/custom_reward_logic_v2": 0.1518750011920929, "step": 630 }, { "completion_length": 18.51875, "epoch": 0.00509712410700775, "grad_norm": 1.2197966575622559, "kl": 0.4328078910708427, "learning_rate": 5.848888922025553e-07, "loss": 0.0173, "reward": 0.045000001043081286, "reward_std": 0.08135274946689605, "rewards/custom_reward_logic_v2": 0.045000001043081286, "step": 640 }, { "completion_length": 19.89375, "epoch": 0.005176766671179746, "grad_norm": 0.24087023735046387, "kl": 0.39142851531505585, "learning_rate": 5.166166492719124e-07, "loss": 0.0157, "reward": 0.0818750023841858, "reward_std": 0.11013087928295136, "rewards/custom_reward_logic_v2": 0.0818750023841858, "step": 650 }, { "completion_length": 19.23125, "epoch": 0.005256409235351742, "grad_norm": 1.1138290166854858, "kl": 0.3609082795679569, "learning_rate": 4.5211988927752026e-07, "loss": 0.0144, "reward": 0.13375000059604644, "reward_std": 0.22290636524558066, "rewards/custom_reward_logic_v2": 0.13375000059604644, "step": 660 }, { "completion_length": 21.1375, "epoch": 0.005336051799523738, "grad_norm": 0.7887033820152283, "kl": 0.36398947462439535, "learning_rate": 3.915213854677863e-07, "loss": 0.0146, "reward": 0.07750000171363354, "reward_std": 0.09986742436885834, "rewards/custom_reward_logic_v2": 0.07750000171363354, "step": 670 }, { "completion_length": 20.7375, "epoch": 0.005415694363695734, "grad_norm": 1.2118674516677856, "kl": 0.34819948896765707, "learning_rate": 3.3493649053890325e-07, "loss": 0.0139, "reward": 0.14000000059604645, "reward_std": 0.15659263283014296, "rewards/custom_reward_logic_v2": 0.14000000059604645, "step": 680 }, { "completion_length": 25.03125, "epoch": 0.00549533692786773, "grad_norm": 0.6678434014320374, "kl": 0.3506194405257702, "learning_rate": 2.8247291705444575e-07, "loss": 0.014, "reward": 0.10087500289082527, "reward_std": 0.19995234534144402, "rewards/custom_reward_logic_v2": 0.10087500289082527, "step": 690 }, { "completion_length": 19.75, "epoch": 0.005574979492039726, "grad_norm": 1.028297781944275, "kl": 0.33339232876896857, "learning_rate": 2.3423053240837518e-07, "loss": 0.0133, "reward": 0.09312500022351741, "reward_std": 0.09688087031245232, "rewards/custom_reward_logic_v2": 0.09312500022351741, "step": 700 } ], "logging_steps": 10, "max_steps": 800, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }