| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.005574979492039726, | |
| "eval_steps": 500, | |
| "global_step": 700, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 739.7375, | |
| "epoch": 7.964256417199609e-05, | |
| "grad_norm": 0.17606443166732788, | |
| "kl": 0.0006033612473402173, | |
| "learning_rate": 6.25e-07, | |
| "loss": 0.0, | |
| "reward": -3.3789249688386915, | |
| "reward_std": 1.234160715341568, | |
| "rewards/custom_reward_logic_v2": -3.3789249688386915, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 881.3875, | |
| "epoch": 0.00015928512834399218, | |
| "grad_norm": 0.1873437762260437, | |
| "kl": 0.0007326043589273468, | |
| "learning_rate": 1.25e-06, | |
| "loss": 0.0, | |
| "reward": -4.281049972772598, | |
| "reward_std": 1.4586432427167892, | |
| "rewards/custom_reward_logic_v2": -4.281049972772598, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 798.325, | |
| "epoch": 0.00023892769251598824, | |
| "grad_norm": 0.16695357859134674, | |
| "kl": 0.0007708041899604723, | |
| "learning_rate": 1.8750000000000003e-06, | |
| "loss": 0.0, | |
| "reward": -3.79504998922348, | |
| "reward_std": 1.4004287779331208, | |
| "rewards/custom_reward_logic_v2": -3.79504998922348, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 885.76875, | |
| "epoch": 0.00031857025668798435, | |
| "grad_norm": 0.18757909536361694, | |
| "kl": 0.0007291340152733028, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.0, | |
| "reward": -3.8223875135183336, | |
| "reward_std": 1.053759826719761, | |
| "rewards/custom_reward_logic_v2": -3.8223875135183336, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 806.35625, | |
| "epoch": 0.0003982128208599804, | |
| "grad_norm": 0.1678084433078766, | |
| "kl": 0.000756343750981614, | |
| "learning_rate": 3.125e-06, | |
| "loss": 0.0, | |
| "reward": -3.831325000524521, | |
| "reward_std": 1.2611359059810638, | |
| "rewards/custom_reward_logic_v2": -3.831325000524521, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 860.95, | |
| "epoch": 0.0004778553850319765, | |
| "grad_norm": 0.16106949746608734, | |
| "kl": 0.0007545762317022308, | |
| "learning_rate": 3.7500000000000005e-06, | |
| "loss": 0.0, | |
| "reward": -3.99547501206398, | |
| "reward_std": 1.233138319849968, | |
| "rewards/custom_reward_logic_v2": -3.99547501206398, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 831.175, | |
| "epoch": 0.0005574979492039726, | |
| "grad_norm": 0.1723652333021164, | |
| "kl": 0.0007971685263328254, | |
| "learning_rate": 4.3750000000000005e-06, | |
| "loss": 0.0, | |
| "reward": -4.036549943685531, | |
| "reward_std": 1.5394920334219933, | |
| "rewards/custom_reward_logic_v2": -4.036549943685531, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 874.325, | |
| "epoch": 0.0006371405133759687, | |
| "grad_norm": 0.2079666703939438, | |
| "kl": 0.0008876581850927323, | |
| "learning_rate": 5e-06, | |
| "loss": 0.0, | |
| "reward": -3.92242501154542, | |
| "reward_std": 1.2604085817933082, | |
| "rewards/custom_reward_logic_v2": -3.92242501154542, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 791.91875, | |
| "epoch": 0.0007167830775479647, | |
| "grad_norm": 0.16253575682640076, | |
| "kl": 0.0010255174711346626, | |
| "learning_rate": 4.997620553954645e-06, | |
| "loss": 0.0, | |
| "reward": -3.364587500691414, | |
| "reward_std": 1.2228698313236237, | |
| "rewards/custom_reward_logic_v2": -3.364587500691414, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 846.2875, | |
| "epoch": 0.0007964256417199608, | |
| "grad_norm": 0.18019770085811615, | |
| "kl": 0.0013353260728763416, | |
| "learning_rate": 4.990486745229364e-06, | |
| "loss": 0.0001, | |
| "reward": -3.805912530422211, | |
| "reward_std": 1.2458222389221192, | |
| "rewards/custom_reward_logic_v2": -3.805912530422211, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 729.04375, | |
| "epoch": 0.0008760682058919569, | |
| "grad_norm": 0.1617293506860733, | |
| "kl": 0.0018967354553751647, | |
| "learning_rate": 4.978612153434527e-06, | |
| "loss": 0.0001, | |
| "reward": -3.071175017207861, | |
| "reward_std": 1.3524149775505065, | |
| "rewards/custom_reward_logic_v2": -3.071175017207861, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 641.74375, | |
| "epoch": 0.000955710770063953, | |
| "grad_norm": 0.26608461141586304, | |
| "kl": 0.0029119997401721776, | |
| "learning_rate": 4.962019382530521e-06, | |
| "loss": 0.0001, | |
| "reward": -2.690687493979931, | |
| "reward_std": 1.0754198133945465, | |
| "rewards/custom_reward_logic_v2": -2.690687493979931, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 883.9875, | |
| "epoch": 0.001035353334235949, | |
| "grad_norm": 0.7612231373786926, | |
| "kl": 0.003597881377208978, | |
| "learning_rate": 4.9407400177998335e-06, | |
| "loss": 0.0001, | |
| "reward": -3.8035999715328215, | |
| "reward_std": 1.2502110481262207, | |
| "rewards/custom_reward_logic_v2": -3.8035999715328215, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 524.48125, | |
| "epoch": 0.0011149958984079452, | |
| "grad_norm": 0.9543402791023254, | |
| "kl": 0.08978197913384064, | |
| "learning_rate": 4.914814565722671e-06, | |
| "loss": 0.0036, | |
| "reward": -2.0596874909475447, | |
| "reward_std": 1.3678732179105282, | |
| "rewards/custom_reward_logic_v2": -2.0596874909475447, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 46.43125, | |
| "epoch": 0.0011946384625799412, | |
| "grad_norm": 0.7850804328918457, | |
| "kl": 0.3018287725746632, | |
| "learning_rate": 4.884292376870567e-06, | |
| "loss": 0.0121, | |
| "reward": -0.04024999849498272, | |
| "reward_std": 0.430637900531292, | |
| "rewards/custom_reward_logic_v2": -0.04024999849498272, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 18.93125, | |
| "epoch": 0.0012742810267519374, | |
| "grad_norm": 0.032512303441762924, | |
| "kl": 0.33459745422005654, | |
| "learning_rate": 4.849231551964771e-06, | |
| "loss": 0.0134, | |
| "reward": 0.1650000035762787, | |
| "reward_std": 0.07605109438300132, | |
| "rewards/custom_reward_logic_v2": 0.1650000035762787, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 17.4375, | |
| "epoch": 0.0013539235909239334, | |
| "grad_norm": 0.02004638873040676, | |
| "kl": 0.35064528286457064, | |
| "learning_rate": 4.809698831278217e-06, | |
| "loss": 0.014, | |
| "reward": 0.08999999985098839, | |
| "reward_std": 0.125558003783226, | |
| "rewards/custom_reward_logic_v2": 0.08999999985098839, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 23.975, | |
| "epoch": 0.0014335661550959294, | |
| "grad_norm": 0.2281995564699173, | |
| "kl": 0.3118164837360382, | |
| "learning_rate": 4.765769467591626e-06, | |
| "loss": 0.0125, | |
| "reward": 0.08099999986588954, | |
| "reward_std": 0.18301311507821083, | |
| "rewards/custom_reward_logic_v2": 0.08099999986588954, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 17.7, | |
| "epoch": 0.0015132087192679256, | |
| "grad_norm": 0.20832708477973938, | |
| "kl": 0.34881954491138456, | |
| "learning_rate": 4.717527082945555e-06, | |
| "loss": 0.014, | |
| "reward": 0.14687500111758708, | |
| "reward_std": 0.13193419948220253, | |
| "rewards/custom_reward_logic_v2": 0.14687500111758708, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 28.76875, | |
| "epoch": 0.0015928512834399217, | |
| "grad_norm": 0.2148224264383316, | |
| "kl": 0.4086977861821651, | |
| "learning_rate": 4.665063509461098e-06, | |
| "loss": 0.0163, | |
| "reward": 0.06411250084638595, | |
| "reward_std": 0.09681975245475768, | |
| "rewards/custom_reward_logic_v2": 0.06411250084638595, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 19.3, | |
| "epoch": 0.0016724938476119177, | |
| "grad_norm": 0.03454764187335968, | |
| "kl": 0.3337583176791668, | |
| "learning_rate": 4.608478614532215e-06, | |
| "loss": 0.0134, | |
| "reward": 0.21312500052154065, | |
| "reward_std": 0.1542310357093811, | |
| "rewards/custom_reward_logic_v2": 0.21312500052154065, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 48.96875, | |
| "epoch": 0.0017521364117839139, | |
| "grad_norm": 0.8877259492874146, | |
| "kl": 0.3230514988303185, | |
| "learning_rate": 4.54788011072248e-06, | |
| "loss": 0.0129, | |
| "reward": -0.12147499993443489, | |
| "reward_std": 0.4157312333583832, | |
| "rewards/custom_reward_logic_v2": -0.12147499993443489, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 19.41875, | |
| "epoch": 0.0018317789759559099, | |
| "grad_norm": 0.7465932369232178, | |
| "kl": 0.32680382803082464, | |
| "learning_rate": 4.4833833507280884e-06, | |
| "loss": 0.0131, | |
| "reward": 0.14000000059604645, | |
| "reward_std": 0.09731742069125175, | |
| "rewards/custom_reward_logic_v2": 0.14000000059604645, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 23.425, | |
| "epoch": 0.001911421540127906, | |
| "grad_norm": 0.4111487567424774, | |
| "kl": 0.3509559452533722, | |
| "learning_rate": 4.415111107797445e-06, | |
| "loss": 0.014, | |
| "reward": 0.18286250159144402, | |
| "reward_std": 0.1811980500817299, | |
| "rewards/custom_reward_logic_v2": 0.18286250159144402, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 18.91875, | |
| "epoch": 0.001991064104299902, | |
| "grad_norm": 0.8882763385772705, | |
| "kl": 0.3525215476751328, | |
| "learning_rate": 4.34319334202531e-06, | |
| "loss": 0.0141, | |
| "reward": 0.17062499970197678, | |
| "reward_std": 0.11504097878932953, | |
| "rewards/custom_reward_logic_v2": 0.17062499970197678, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 20.475, | |
| "epoch": 0.002070706668471898, | |
| "grad_norm": 0.03645075112581253, | |
| "kl": 0.3291649468243122, | |
| "learning_rate": 4.267766952966369e-06, | |
| "loss": 0.0132, | |
| "reward": 0.16500000059604644, | |
| "reward_std": 0.1858065977692604, | |
| "rewards/custom_reward_logic_v2": 0.16500000059604644, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 24.5625, | |
| "epoch": 0.0021503492326438944, | |
| "grad_norm": 1.1167131662368774, | |
| "kl": 0.33756194859743116, | |
| "learning_rate": 4.188975519039151e-06, | |
| "loss": 0.0135, | |
| "reward": 0.10505000110715627, | |
| "reward_std": 0.0828484557569027, | |
| "rewards/custom_reward_logic_v2": 0.10505000110715627, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 19.925, | |
| "epoch": 0.0022299917968158904, | |
| "grad_norm": 0.8635123372077942, | |
| "kl": 0.32979664355516436, | |
| "learning_rate": 4.106969024216348e-06, | |
| "loss": 0.0132, | |
| "reward": 0.20062500163912772, | |
| "reward_std": 0.1258012667298317, | |
| "rewards/custom_reward_logic_v2": 0.20062500163912772, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 41.35625, | |
| "epoch": 0.0023096343609878864, | |
| "grad_norm": 0.7731335163116455, | |
| "kl": 0.29900490418076514, | |
| "learning_rate": 4.021903572521802e-06, | |
| "loss": 0.012, | |
| "reward": 0.13356250263750552, | |
| "reward_std": 0.11855373680591583, | |
| "rewards/custom_reward_logic_v2": 0.13356250263750552, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 21.425, | |
| "epoch": 0.0023892769251598824, | |
| "grad_norm": 0.050558220595121384, | |
| "kl": 0.30905950888991357, | |
| "learning_rate": 3.933941090877615e-06, | |
| "loss": 0.0124, | |
| "reward": 0.10625000111758709, | |
| "reward_std": 0.07851103022694587, | |
| "rewards/custom_reward_logic_v2": 0.10625000111758709, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 28.39375, | |
| "epoch": 0.0024689194893318784, | |
| "grad_norm": 1.2737127542495728, | |
| "kl": 0.3259002223610878, | |
| "learning_rate": 3.8432490208670605e-06, | |
| "loss": 0.013, | |
| "reward": 0.07012500055134296, | |
| "reward_std": 0.21550666987895967, | |
| "rewards/custom_reward_logic_v2": 0.07012500055134296, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 20.49375, | |
| "epoch": 0.002548562053503875, | |
| "grad_norm": 1.3667010068893433, | |
| "kl": 0.32961594611406325, | |
| "learning_rate": 3.7500000000000005e-06, | |
| "loss": 0.0132, | |
| "reward": 0.15562500059604645, | |
| "reward_std": 0.14379026368260384, | |
| "rewards/custom_reward_logic_v2": 0.15562500059604645, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 23.7625, | |
| "epoch": 0.002628204617675871, | |
| "grad_norm": 0.9662195444107056, | |
| "kl": 0.3291011206805706, | |
| "learning_rate": 3.654371533087586e-06, | |
| "loss": 0.0132, | |
| "reward": 0.20617500003427267, | |
| "reward_std": 0.12530190348625184, | |
| "rewards/custom_reward_logic_v2": 0.20617500003427267, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 19.15, | |
| "epoch": 0.002707847181847867, | |
| "grad_norm": 2.964785099029541, | |
| "kl": 0.3629206448793411, | |
| "learning_rate": 3.556545654351749e-06, | |
| "loss": 0.0145, | |
| "reward": 0.10437500067055225, | |
| "reward_std": 0.12071752324700355, | |
| "rewards/custom_reward_logic_v2": 0.10437500067055225, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 20.4875, | |
| "epoch": 0.002787489746019863, | |
| "grad_norm": 1.0044533014297485, | |
| "kl": 0.3254102662205696, | |
| "learning_rate": 3.4567085809127247e-06, | |
| "loss": 0.013, | |
| "reward": 0.15562499798834323, | |
| "reward_std": 0.15355074554681777, | |
| "rewards/custom_reward_logic_v2": 0.15562499798834323, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 21.21875, | |
| "epoch": 0.002867132310191859, | |
| "grad_norm": 0.8673160672187805, | |
| "kl": 0.328788036108017, | |
| "learning_rate": 3.3550503583141726e-06, | |
| "loss": 0.0132, | |
| "reward": 0.22808750197291375, | |
| "reward_std": 0.14038661643862724, | |
| "rewards/custom_reward_logic_v2": 0.22808750197291375, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 18.29375, | |
| "epoch": 0.002946774874363855, | |
| "grad_norm": 1.275578260421753, | |
| "kl": 0.3586613781750202, | |
| "learning_rate": 3.2517644987606827e-06, | |
| "loss": 0.0143, | |
| "reward": 0.09437500052154064, | |
| "reward_std": 0.13283729180693626, | |
| "rewards/custom_reward_logic_v2": 0.09437500052154064, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 19.625, | |
| "epoch": 0.0030264174385358513, | |
| "grad_norm": 1.135249376296997, | |
| "kl": 0.3399433046579361, | |
| "learning_rate": 3.147047612756302e-06, | |
| "loss": 0.0136, | |
| "reward": 0.18000000156462193, | |
| "reward_std": 0.1102687232196331, | |
| "rewards/custom_reward_logic_v2": 0.18000000156462193, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 18.65625, | |
| "epoch": 0.0031060600027078473, | |
| "grad_norm": 0.0214656013995409, | |
| "kl": 0.3453727260231972, | |
| "learning_rate": 3.0410990348452572e-06, | |
| "loss": 0.0138, | |
| "reward": 0.14312500059604644, | |
| "reward_std": 0.21185824573040007, | |
| "rewards/custom_reward_logic_v2": 0.14312500059604644, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 22.5875, | |
| "epoch": 0.0031857025668798433, | |
| "grad_norm": 1.1392817497253418, | |
| "kl": 0.3561431519687176, | |
| "learning_rate": 2.9341204441673267e-06, | |
| "loss": 0.0142, | |
| "reward": 0.09312500022351741, | |
| "reward_std": 0.09467698186635971, | |
| "rewards/custom_reward_logic_v2": 0.09312500022351741, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 21.9125, | |
| "epoch": 0.0032653451310518393, | |
| "grad_norm": 0.12919628620147705, | |
| "kl": 0.3515960440039635, | |
| "learning_rate": 2.82631548055013e-06, | |
| "loss": 0.0141, | |
| "reward": 0.07376250103116036, | |
| "reward_std": 0.15707473903894426, | |
| "rewards/custom_reward_logic_v2": 0.07376250103116036, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 20.325, | |
| "epoch": 0.0033449876952238353, | |
| "grad_norm": 0.08202961087226868, | |
| "kl": 0.34852803200483323, | |
| "learning_rate": 2.717889356869146e-06, | |
| "loss": 0.0139, | |
| "reward": 0.2074999999254942, | |
| "reward_std": 0.11485048606991768, | |
| "rewards/custom_reward_logic_v2": 0.2074999999254942, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 19.7375, | |
| "epoch": 0.0034246302593958313, | |
| "grad_norm": 1.5309367179870605, | |
| "kl": 0.339575307816267, | |
| "learning_rate": 2.6090484684133406e-06, | |
| "loss": 0.0136, | |
| "reward": 0.03375000059604645, | |
| "reward_std": 0.08364979848265648, | |
| "rewards/custom_reward_logic_v2": 0.03375000059604645, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 29.5375, | |
| "epoch": 0.0035042728235678278, | |
| "grad_norm": 0.08438611030578613, | |
| "kl": 0.34286700189113617, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.0137, | |
| "reward": 0.08044999912381172, | |
| "reward_std": 0.16319628208875656, | |
| "rewards/custom_reward_logic_v2": 0.08044999912381172, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 23.2125, | |
| "epoch": 0.0035839153877398238, | |
| "grad_norm": 0.11283387243747711, | |
| "kl": 0.3263735562562943, | |
| "learning_rate": 2.3909515315866606e-06, | |
| "loss": 0.0131, | |
| "reward": 0.11125000230967999, | |
| "reward_std": 0.10089804157614708, | |
| "rewards/custom_reward_logic_v2": 0.11125000230967999, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 20.14375, | |
| "epoch": 0.0036635579519118198, | |
| "grad_norm": 0.7745999693870544, | |
| "kl": 0.3447819516062737, | |
| "learning_rate": 2.2821106431308546e-06, | |
| "loss": 0.0138, | |
| "reward": 0.11187500096857547, | |
| "reward_std": 0.12871785834431648, | |
| "rewards/custom_reward_logic_v2": 0.11187500096857547, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 27.35, | |
| "epoch": 0.003743200516083816, | |
| "grad_norm": 1.4974488019943237, | |
| "kl": 0.3425402037799358, | |
| "learning_rate": 2.173684519449872e-06, | |
| "loss": 0.0137, | |
| "reward": 0.10542500019073486, | |
| "reward_std": 0.22862085253000258, | |
| "rewards/custom_reward_logic_v2": 0.10542500019073486, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 19.40625, | |
| "epoch": 0.003822843080255812, | |
| "grad_norm": 1.1579034328460693, | |
| "kl": 0.3382424309849739, | |
| "learning_rate": 2.0658795558326745e-06, | |
| "loss": 0.0135, | |
| "reward": 0.15562499947845937, | |
| "reward_std": 0.1210292175412178, | |
| "rewards/custom_reward_logic_v2": 0.15562499947845937, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 37.24375, | |
| "epoch": 0.003902485644427808, | |
| "grad_norm": 0.7052723169326782, | |
| "kl": 0.35214473977684974, | |
| "learning_rate": 1.958900965154743e-06, | |
| "loss": 0.0141, | |
| "reward": 0.07051250115036964, | |
| "reward_std": 0.20176818892359732, | |
| "rewards/custom_reward_logic_v2": 0.07051250115036964, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 19.23125, | |
| "epoch": 0.003982128208599804, | |
| "grad_norm": 0.062097422778606415, | |
| "kl": 0.3568013899028301, | |
| "learning_rate": 1.852952387243698e-06, | |
| "loss": 0.0143, | |
| "reward": 0.13374999910593033, | |
| "reward_std": 0.10076134353876114, | |
| "rewards/custom_reward_logic_v2": 0.13374999910593033, | |
| "step": 500 | |
| }, | |
| { | |
| "completion_length": 18.61875, | |
| "epoch": 0.0040617707727718, | |
| "grad_norm": 1.673584222793579, | |
| "kl": 0.3592236742377281, | |
| "learning_rate": 1.7482355012393177e-06, | |
| "loss": 0.0144, | |
| "reward": 0.14000000134110452, | |
| "reward_std": 0.09233622029423713, | |
| "rewards/custom_reward_logic_v2": 0.14000000134110452, | |
| "step": 510 | |
| }, | |
| { | |
| "completion_length": 20.41875, | |
| "epoch": 0.004141413336943796, | |
| "grad_norm": 1.1183210611343384, | |
| "kl": 0.33706687912344935, | |
| "learning_rate": 1.6449496416858285e-06, | |
| "loss": 0.0135, | |
| "reward": 0.10500000081956387, | |
| "reward_std": 0.09869231358170509, | |
| "rewards/custom_reward_logic_v2": 0.10500000081956387, | |
| "step": 520 | |
| }, | |
| { | |
| "completion_length": 20.8125, | |
| "epoch": 0.004221055901115793, | |
| "grad_norm": 0.4382721185684204, | |
| "kl": 0.3610161267220974, | |
| "learning_rate": 1.5432914190872757e-06, | |
| "loss": 0.0144, | |
| "reward": 0.19562500044703485, | |
| "reward_std": 0.1188055507838726, | |
| "rewards/custom_reward_logic_v2": 0.19562500044703485, | |
| "step": 530 | |
| }, | |
| { | |
| "completion_length": 19.46875, | |
| "epoch": 0.004300698465287789, | |
| "grad_norm": 1.3095043897628784, | |
| "kl": 0.34188042730093005, | |
| "learning_rate": 1.443454345648252e-06, | |
| "loss": 0.0137, | |
| "reward": 0.20749999955296516, | |
| "reward_std": 0.19307591021060944, | |
| "rewards/custom_reward_logic_v2": 0.20749999955296516, | |
| "step": 540 | |
| }, | |
| { | |
| "completion_length": 21.63125, | |
| "epoch": 0.004380341029459785, | |
| "grad_norm": 0.04259713739156723, | |
| "kl": 0.3633933149278164, | |
| "learning_rate": 1.3456284669124159e-06, | |
| "loss": 0.0145, | |
| "reward": 0.11875000149011612, | |
| "reward_std": 0.1355846919119358, | |
| "rewards/custom_reward_logic_v2": 0.11875000149011612, | |
| "step": 550 | |
| }, | |
| { | |
| "completion_length": 19.2625, | |
| "epoch": 0.004459983593631781, | |
| "grad_norm": 0.5744329690933228, | |
| "kl": 0.3458960048854351, | |
| "learning_rate": 1.2500000000000007e-06, | |
| "loss": 0.0138, | |
| "reward": 0.3293750025331974, | |
| "reward_std": 0.104243653267622, | |
| "rewards/custom_reward_logic_v2": 0.3293750025331974, | |
| "step": 560 | |
| }, | |
| { | |
| "completion_length": 20.05625, | |
| "epoch": 0.004539626157803777, | |
| "grad_norm": 0.1549508273601532, | |
| "kl": 0.346449576318264, | |
| "learning_rate": 1.1567509791329402e-06, | |
| "loss": 0.0139, | |
| "reward": 0.17625000029802323, | |
| "reward_std": 0.1429968483746052, | |
| "rewards/custom_reward_logic_v2": 0.17625000029802323, | |
| "step": 570 | |
| }, | |
| { | |
| "completion_length": 19.86875, | |
| "epoch": 0.004619268721975773, | |
| "grad_norm": 0.14351911842823029, | |
| "kl": 0.39532790407538415, | |
| "learning_rate": 1.0660589091223854e-06, | |
| "loss": 0.0158, | |
| "reward": 0.17437500022351743, | |
| "reward_std": 0.13940104842185974, | |
| "rewards/custom_reward_logic_v2": 0.17437500022351743, | |
| "step": 580 | |
| }, | |
| { | |
| "completion_length": 19.28125, | |
| "epoch": 0.004698911286147769, | |
| "grad_norm": 1.1975979804992676, | |
| "kl": 0.3690756544470787, | |
| "learning_rate": 9.780964274781984e-07, | |
| "loss": 0.0148, | |
| "reward": 0.20562500059604644, | |
| "reward_std": 0.11339747980237007, | |
| "rewards/custom_reward_logic_v2": 0.20562500059604644, | |
| "step": 590 | |
| }, | |
| { | |
| "completion_length": 18.3125, | |
| "epoch": 0.004778553850319765, | |
| "grad_norm": 0.03664500266313553, | |
| "kl": 0.34169030636548997, | |
| "learning_rate": 8.930309757836517e-07, | |
| "loss": 0.0137, | |
| "reward": 0.12624999806284903, | |
| "reward_std": 0.1257291093468666, | |
| "rewards/custom_reward_logic_v2": 0.12624999806284903, | |
| "step": 600 | |
| }, | |
| { | |
| "completion_length": 20.73125, | |
| "epoch": 0.004858196414491761, | |
| "grad_norm": 1.0636727809906006, | |
| "kl": 0.32965768277645113, | |
| "learning_rate": 8.110244809608494e-07, | |
| "loss": 0.0132, | |
| "reward": 0.1900000013411045, | |
| "reward_std": 0.2276224449276924, | |
| "rewards/custom_reward_logic_v2": 0.1900000013411045, | |
| "step": 610 | |
| }, | |
| { | |
| "completion_length": 21.09375, | |
| "epoch": 0.004937838978663757, | |
| "grad_norm": 0.6413007378578186, | |
| "kl": 0.4171911731362343, | |
| "learning_rate": 7.322330470336314e-07, | |
| "loss": 0.0167, | |
| "reward": 0.06624999977648258, | |
| "reward_std": 0.08008950427174569, | |
| "rewards/custom_reward_logic_v2": 0.06624999977648258, | |
| "step": 620 | |
| }, | |
| { | |
| "completion_length": 19.6125, | |
| "epoch": 0.005017481542835753, | |
| "grad_norm": 0.4128471612930298, | |
| "kl": 0.3929149940609932, | |
| "learning_rate": 6.568066579746901e-07, | |
| "loss": 0.0157, | |
| "reward": 0.1518750011920929, | |
| "reward_std": 0.09802244454622269, | |
| "rewards/custom_reward_logic_v2": 0.1518750011920929, | |
| "step": 630 | |
| }, | |
| { | |
| "completion_length": 18.51875, | |
| "epoch": 0.00509712410700775, | |
| "grad_norm": 1.2197966575622559, | |
| "kl": 0.4328078910708427, | |
| "learning_rate": 5.848888922025553e-07, | |
| "loss": 0.0173, | |
| "reward": 0.045000001043081286, | |
| "reward_std": 0.08135274946689605, | |
| "rewards/custom_reward_logic_v2": 0.045000001043081286, | |
| "step": 640 | |
| }, | |
| { | |
| "completion_length": 19.89375, | |
| "epoch": 0.005176766671179746, | |
| "grad_norm": 0.24087023735046387, | |
| "kl": 0.39142851531505585, | |
| "learning_rate": 5.166166492719124e-07, | |
| "loss": 0.0157, | |
| "reward": 0.0818750023841858, | |
| "reward_std": 0.11013087928295136, | |
| "rewards/custom_reward_logic_v2": 0.0818750023841858, | |
| "step": 650 | |
| }, | |
| { | |
| "completion_length": 19.23125, | |
| "epoch": 0.005256409235351742, | |
| "grad_norm": 1.1138290166854858, | |
| "kl": 0.3609082795679569, | |
| "learning_rate": 4.5211988927752026e-07, | |
| "loss": 0.0144, | |
| "reward": 0.13375000059604644, | |
| "reward_std": 0.22290636524558066, | |
| "rewards/custom_reward_logic_v2": 0.13375000059604644, | |
| "step": 660 | |
| }, | |
| { | |
| "completion_length": 21.1375, | |
| "epoch": 0.005336051799523738, | |
| "grad_norm": 0.7887033820152283, | |
| "kl": 0.36398947462439535, | |
| "learning_rate": 3.915213854677863e-07, | |
| "loss": 0.0146, | |
| "reward": 0.07750000171363354, | |
| "reward_std": 0.09986742436885834, | |
| "rewards/custom_reward_logic_v2": 0.07750000171363354, | |
| "step": 670 | |
| }, | |
| { | |
| "completion_length": 20.7375, | |
| "epoch": 0.005415694363695734, | |
| "grad_norm": 1.2118674516677856, | |
| "kl": 0.34819948896765707, | |
| "learning_rate": 3.3493649053890325e-07, | |
| "loss": 0.0139, | |
| "reward": 0.14000000059604645, | |
| "reward_std": 0.15659263283014296, | |
| "rewards/custom_reward_logic_v2": 0.14000000059604645, | |
| "step": 680 | |
| }, | |
| { | |
| "completion_length": 25.03125, | |
| "epoch": 0.00549533692786773, | |
| "grad_norm": 0.6678434014320374, | |
| "kl": 0.3506194405257702, | |
| "learning_rate": 2.8247291705444575e-07, | |
| "loss": 0.014, | |
| "reward": 0.10087500289082527, | |
| "reward_std": 0.19995234534144402, | |
| "rewards/custom_reward_logic_v2": 0.10087500289082527, | |
| "step": 690 | |
| }, | |
| { | |
| "completion_length": 19.75, | |
| "epoch": 0.005574979492039726, | |
| "grad_norm": 1.028297781944275, | |
| "kl": 0.33339232876896857, | |
| "learning_rate": 2.3423053240837518e-07, | |
| "loss": 0.0133, | |
| "reward": 0.09312500022351741, | |
| "reward_std": 0.09688087031245232, | |
| "rewards/custom_reward_logic_v2": 0.09312500022351741, | |
| "step": 700 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 800, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |