{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.01851689616998909, "eval_steps": 500, "global_step": 2325, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 714.1125, "epoch": 7.964256417199609e-05, "grad_norm": 0.11108597368001938, "kl": 0.0005961419927189126, "learning_rate": 2.0833333333333333e-07, "loss": 0.0, "reward": 0.540625, "reward_std": 0.29713641852140427, "rewards/custom_reward_simplified_v7_dblog": 0.540625, "step": 10 }, { "completion_length": 800.6625, "epoch": 0.00015928512834399218, "grad_norm": 0.1964382529258728, "kl": 0.0007280149788130075, "learning_rate": 4.1666666666666667e-07, "loss": 0.0, "reward": 0.496875, "reward_std": 0.25719649270176886, "rewards/custom_reward_simplified_v7_dblog": 0.496875, "step": 20 }, { "completion_length": 750.46875, "epoch": 0.00023892769251598824, "grad_norm": 0.15792745351791382, "kl": 0.0007828957575839012, "learning_rate": 6.25e-07, "loss": 0.0, "reward": 0.684375, "reward_std": 0.3755971297621727, "rewards/custom_reward_simplified_v7_dblog": 0.684375, "step": 30 }, { "completion_length": 813.94375, "epoch": 0.00031857025668798435, "grad_norm": 0.12503573298454285, "kl": 0.0007155703555326909, "learning_rate": 8.333333333333333e-07, "loss": 0.0, "reward": 0.565625, "reward_std": 0.2761854581534863, "rewards/custom_reward_simplified_v7_dblog": 0.565625, "step": 40 }, { "completion_length": 747.675, "epoch": 0.0003982128208599804, "grad_norm": 0.10329681634902954, "kl": 0.0007686431898036971, "learning_rate": 1.0416666666666667e-06, "loss": 0.0, "reward": 0.621875, "reward_std": 0.30715219378471376, "rewards/custom_reward_simplified_v7_dblog": 0.621875, "step": 50 }, { "completion_length": 821.60625, "epoch": 0.0004778553850319765, "grad_norm": 0.1834840029478073, "kl": 0.0007538022648077459, "learning_rate": 1.25e-06, "loss": 0.0, "reward": 0.578125, "reward_std": 0.39505376294255257, "rewards/custom_reward_simplified_v7_dblog": 0.578125, "step": 60 }, { "completion_length": 776.75, "epoch": 0.0005574979492039726, "grad_norm": 0.11483483016490936, "kl": 0.0007510531373554841, "learning_rate": 1.4583333333333335e-06, "loss": 0.0, "reward": 0.584375, "reward_std": 0.32483330443501474, "rewards/custom_reward_simplified_v7_dblog": 0.584375, "step": 70 }, { "completion_length": 804.675, "epoch": 0.0006371405133759687, "grad_norm": 0.17995329201221466, "kl": 0.0007302156562218442, "learning_rate": 1.6666666666666667e-06, "loss": 0.0, "reward": 0.703125, "reward_std": 0.32263160347938535, "rewards/custom_reward_simplified_v7_dblog": 0.703125, "step": 80 }, { "completion_length": 793.0875, "epoch": 0.0007167830775479647, "grad_norm": 0.16513389348983765, "kl": 0.0007239854254294187, "learning_rate": 1.8750000000000003e-06, "loss": 0.0, "reward": 0.709375, "reward_std": 0.3102527566254139, "rewards/custom_reward_simplified_v7_dblog": 0.709375, "step": 90 }, { "completion_length": 812.0, "epoch": 0.0007964256417199608, "grad_norm": 0.1802467256784439, "kl": 0.0007639184041181579, "learning_rate": 2.0833333333333334e-06, "loss": 0.0, "reward": 0.528125, "reward_std": 0.21242836564779283, "rewards/custom_reward_simplified_v7_dblog": 0.528125, "step": 100 }, { "completion_length": 784.64375, "epoch": 0.0008760682058919569, "grad_norm": 0.17609436810016632, "kl": 0.0007660316972760483, "learning_rate": 2.2916666666666666e-06, "loss": 0.0, "reward": 0.565625, "reward_std": 0.3309394560754299, "rewards/custom_reward_simplified_v7_dblog": 0.565625, "step": 110 }, { "completion_length": 717.24375, "epoch": 0.000955710770063953, "grad_norm": 0.14550578594207764, "kl": 0.0007782038446748629, "learning_rate": 2.5e-06, "loss": 0.0, "reward": 0.728125, "reward_std": 0.2573545627295971, "rewards/custom_reward_simplified_v7_dblog": 0.728125, "step": 120 }, { "completion_length": 872.6375, "epoch": 0.001035353334235949, "grad_norm": 0.11807532608509064, "kl": 0.0007370044564595446, "learning_rate": 2.7083333333333334e-06, "loss": 0.0, "reward": 0.45, "reward_std": 0.24368184804916382, "rewards/custom_reward_simplified_v7_dblog": 0.45, "step": 130 }, { "completion_length": 780.325, "epoch": 0.0011149958984079452, "grad_norm": 0.21067936718463898, "kl": 0.0007969280297402293, "learning_rate": 2.916666666666667e-06, "loss": 0.0, "reward": 0.671875, "reward_std": 0.3312204420566559, "rewards/custom_reward_simplified_v7_dblog": 0.671875, "step": 140 }, { "completion_length": 796.15625, "epoch": 0.0011946384625799412, "grad_norm": 0.11178277432918549, "kl": 0.0007584215141832829, "learning_rate": 3.125e-06, "loss": 0.0, "reward": 0.675, "reward_std": 0.2411833107471466, "rewards/custom_reward_simplified_v7_dblog": 0.675, "step": 150 }, { "completion_length": 735.4375, "epoch": 0.0012742810267519374, "grad_norm": 0.12408847361803055, "kl": 0.0008089728711638599, "learning_rate": 3.3333333333333333e-06, "loss": 0.0, "reward": 0.5875, "reward_std": 0.2907834567129612, "rewards/custom_reward_simplified_v7_dblog": 0.5875, "step": 160 }, { "completion_length": 630.76875, "epoch": 0.0013539235909239334, "grad_norm": 0.14481835067272186, "kl": 0.0008351787488209084, "learning_rate": 3.5416666666666673e-06, "loss": 0.0, "reward": 0.828125, "reward_std": 0.3232325129210949, "rewards/custom_reward_simplified_v7_dblog": 0.828125, "step": 170 }, { "completion_length": 704.2, "epoch": 0.0014335661550959294, "grad_norm": 0.22581899166107178, "kl": 0.0008706353197339922, "learning_rate": 3.7500000000000005e-06, "loss": 0.0, "reward": 0.621875, "reward_std": 0.2438264600932598, "rewards/custom_reward_simplified_v7_dblog": 0.621875, "step": 180 }, { "completion_length": 738.2625, "epoch": 0.0015132087192679256, "grad_norm": 0.20901009440422058, "kl": 0.000852665287675336, "learning_rate": 3.958333333333333e-06, "loss": 0.0, "reward": 0.659375, "reward_std": 0.2661551833152771, "rewards/custom_reward_simplified_v7_dblog": 0.659375, "step": 190 }, { "completion_length": 773.31875, "epoch": 0.0015928512834399217, "grad_norm": 0.14023999869823456, "kl": 0.0008427878346992657, "learning_rate": 4.166666666666667e-06, "loss": 0.0, "reward": 0.575, "reward_std": 0.263551290333271, "rewards/custom_reward_simplified_v7_dblog": 0.575, "step": 200 }, { "completion_length": 760.4, "epoch": 0.0016724938476119177, "grad_norm": 0.15415024757385254, "kl": 0.0009272771596442908, "learning_rate": 4.3750000000000005e-06, "loss": 0.0, "reward": 0.578125, "reward_std": 0.3055797599256039, "rewards/custom_reward_simplified_v7_dblog": 0.578125, "step": 210 }, { "completion_length": 824.94375, "epoch": 0.0017521364117839139, "grad_norm": 0.18523605167865753, "kl": 0.0008898543601389974, "learning_rate": 4.583333333333333e-06, "loss": 0.0, "reward": 0.58125, "reward_std": 0.2951655209064484, "rewards/custom_reward_simplified_v7_dblog": 0.58125, "step": 220 }, { "completion_length": 748.5375, "epoch": 0.0018317789759559099, "grad_norm": 0.11306847631931305, "kl": 0.0009787698683794588, "learning_rate": 4.791666666666668e-06, "loss": 0.0, "reward": 0.590625, "reward_std": 0.2887454777956009, "rewards/custom_reward_simplified_v7_dblog": 0.590625, "step": 230 }, { "completion_length": 753.39375, "epoch": 0.001911421540127906, "grad_norm": 0.0014718669699504972, "kl": 0.0010118414385942743, "learning_rate": 5e-06, "loss": 0.0, "reward": 0.55625, "reward_std": 0.1977315753698349, "rewards/custom_reward_simplified_v7_dblog": 0.55625, "step": 240 }, { "completion_length": 812.71875, "epoch": 0.001991064104299902, "grad_norm": 0.11223085969686508, "kl": 0.0010390775743871928, "learning_rate": 4.999735579817769e-06, "loss": 0.0, "reward": 0.6875, "reward_std": 0.24264758601784706, "rewards/custom_reward_simplified_v7_dblog": 0.6875, "step": 250 }, { "completion_length": 731.66875, "epoch": 0.002070706668471898, "grad_norm": 0.1944543570280075, "kl": 0.001084678602637723, "learning_rate": 4.998942375205502e-06, "loss": 0.0, "reward": 0.796875, "reward_std": 0.31279500126838683, "rewards/custom_reward_simplified_v7_dblog": 0.796875, "step": 260 }, { "completion_length": 770.69375, "epoch": 0.0021503492326438944, "grad_norm": 0.10904921591281891, "kl": 0.0012701354396995157, "learning_rate": 4.997620553954645e-06, "loss": 0.0001, "reward": 0.653125, "reward_std": 0.1583670809864998, "rewards/custom_reward_simplified_v7_dblog": 0.653125, "step": 270 }, { "completion_length": 646.63125, "epoch": 0.0022299917968158904, "grad_norm": 0.11776451766490936, "kl": 0.0026803009008290247, "learning_rate": 4.995770395678171e-06, "loss": 0.0001, "reward": 0.78125, "reward_std": 0.37105962783098223, "rewards/custom_reward_simplified_v7_dblog": 0.78125, "step": 280 }, { "completion_length": 850.5125, "epoch": 0.0023096343609878864, "grad_norm": 0.17029190063476562, "kl": 0.0011812534503405914, "learning_rate": 4.993392291751431e-06, "loss": 0.0, "reward": 0.503125, "reward_std": 0.2320079453289509, "rewards/custom_reward_simplified_v7_dblog": 0.503125, "step": 290 }, { "completion_length": 774.4125, "epoch": 0.0023892769251598824, "grad_norm": 0.17417992651462555, "kl": 0.001456298804259859, "learning_rate": 4.990486745229364e-06, "loss": 0.0001, "reward": 0.621875, "reward_std": 0.24568462520837783, "rewards/custom_reward_simplified_v7_dblog": 0.621875, "step": 300 }, { "completion_length": 733.675, "epoch": 0.0024689194893318784, "grad_norm": 0.13222694396972656, "kl": 0.001577114372048527, "learning_rate": 4.9870543707400835e-06, "loss": 0.0001, "reward": 0.653125, "reward_std": 0.27914761677384375, "rewards/custom_reward_simplified_v7_dblog": 0.653125, "step": 310 }, { "completion_length": 711.91875, "epoch": 0.002548562053503875, "grad_norm": 0.19241130352020264, "kl": 0.0017230566183570773, "learning_rate": 4.983095894354858e-06, "loss": 0.0001, "reward": 0.68125, "reward_std": 0.3178554192185402, "rewards/custom_reward_simplified_v7_dblog": 0.68125, "step": 320 }, { "completion_length": 780.34375, "epoch": 0.002628204617675871, "grad_norm": 0.1997414082288742, "kl": 0.002029248425969854, "learning_rate": 4.978612153434527e-06, "loss": 0.0001, "reward": 0.696875, "reward_std": 0.32896072417497635, "rewards/custom_reward_simplified_v7_dblog": 0.696875, "step": 330 }, { "completion_length": 695.63125, "epoch": 0.002707847181847867, "grad_norm": 0.18966233730316162, "kl": 0.002277573832543567, "learning_rate": 4.973604096452361e-06, "loss": 0.0001, "reward": 0.684375, "reward_std": 0.2995404839515686, "rewards/custom_reward_simplified_v7_dblog": 0.684375, "step": 340 }, { "completion_length": 719.425, "epoch": 0.002787489746019863, "grad_norm": 0.17769980430603027, "kl": 0.002305405435618013, "learning_rate": 4.968072782793436e-06, "loss": 0.0001, "reward": 0.74375, "reward_std": 0.3807508498430252, "rewards/custom_reward_simplified_v7_dblog": 0.74375, "step": 350 }, { "completion_length": 732.4375, "epoch": 0.002867132310191859, "grad_norm": 0.21898534893989563, "kl": 0.002607938600704074, "learning_rate": 4.962019382530521e-06, "loss": 0.0001, "reward": 0.596875, "reward_std": 0.303117785602808, "rewards/custom_reward_simplified_v7_dblog": 0.596875, "step": 360 }, { "completion_length": 703.21875, "epoch": 0.002946774874363855, "grad_norm": 0.20463985204696655, "kl": 0.0030091375578194858, "learning_rate": 4.955445176176577e-06, "loss": 0.0001, "reward": 0.746875, "reward_std": 0.28880608528852464, "rewards/custom_reward_simplified_v7_dblog": 0.746875, "step": 370 }, { "completion_length": 646.95625, "epoch": 0.0030264174385358513, "grad_norm": 0.17787523567676544, "kl": 0.003602780296932906, "learning_rate": 4.948351554413879e-06, "loss": 0.0001, "reward": 0.753125, "reward_std": 0.31493050456047056, "rewards/custom_reward_simplified_v7_dblog": 0.753125, "step": 380 }, { "completion_length": 656.94375, "epoch": 0.0031060600027078473, "grad_norm": 0.18550129234790802, "kl": 0.003282526368275285, "learning_rate": 4.9407400177998335e-06, "loss": 0.0001, "reward": 0.828125, "reward_std": 0.33323406875133516, "rewards/custom_reward_simplified_v7_dblog": 0.828125, "step": 390 }, { "completion_length": 740.3125, "epoch": 0.0031857025668798433, "grad_norm": 0.19987954199314117, "kl": 0.003102585405576974, "learning_rate": 4.93261217644956e-06, "loss": 0.0001, "reward": 0.590625, "reward_std": 0.26303397938609124, "rewards/custom_reward_simplified_v7_dblog": 0.590625, "step": 400 }, { "completion_length": 641.26875, "epoch": 0.0032653451310518393, "grad_norm": 0.21161562204360962, "kl": 0.003351045388262719, "learning_rate": 4.9239697496952904e-06, "loss": 0.0001, "reward": 0.909375, "reward_std": 0.3579762116074562, "rewards/custom_reward_simplified_v7_dblog": 0.909375, "step": 410 }, { "completion_length": 692.06875, "epoch": 0.0033449876952238353, "grad_norm": 0.17584940791130066, "kl": 0.003339459316339344, "learning_rate": 4.914814565722671e-06, "loss": 0.0001, "reward": 0.765625, "reward_std": 0.3109076008200645, "rewards/custom_reward_simplified_v7_dblog": 0.765625, "step": 420 }, { "completion_length": 638.51875, "epoch": 0.0034246302593958313, "grad_norm": 0.17778904736042023, "kl": 0.0034626491484232246, "learning_rate": 4.905148561184033e-06, "loss": 0.0001, "reward": 0.671875, "reward_std": 0.2665500298142433, "rewards/custom_reward_simplified_v7_dblog": 0.671875, "step": 430 }, { "completion_length": 597.475, "epoch": 0.0035042728235678278, "grad_norm": 0.127123162150383, "kl": 0.0039646215736866, "learning_rate": 4.894973780788722e-06, "loss": 0.0002, "reward": 0.88125, "reward_std": 0.28942874893546106, "rewards/custom_reward_simplified_v7_dblog": 0.88125, "step": 440 }, { "completion_length": 651.6625, "epoch": 0.0035839153877398238, "grad_norm": 0.21087874472141266, "kl": 0.004210945626255125, "learning_rate": 4.884292376870567e-06, "loss": 0.0002, "reward": 0.753125, "reward_std": 0.29777742698788645, "rewards/custom_reward_simplified_v7_dblog": 0.753125, "step": 450 }, { "completion_length": 727.13125, "epoch": 0.0036635579519118198, "grad_norm": 0.18630079925060272, "kl": 0.003935616160742938, "learning_rate": 4.873106608932585e-06, "loss": 0.0002, "reward": 0.678125, "reward_std": 0.31932896226644514, "rewards/custom_reward_simplified_v7_dblog": 0.678125, "step": 460 }, { "completion_length": 716.74375, "epoch": 0.003743200516083816, "grad_norm": 0.1637570858001709, "kl": 0.004373999196104705, "learning_rate": 4.861418843169012e-06, "loss": 0.0002, "reward": 0.646875, "reward_std": 0.26624983847141265, "rewards/custom_reward_simplified_v7_dblog": 0.646875, "step": 470 }, { "completion_length": 581.90625, "epoch": 0.003822843080255812, "grad_norm": 0.0051241409964859486, "kl": 0.004909415659494698, "learning_rate": 4.849231551964771e-06, "loss": 0.0002, "reward": 0.75625, "reward_std": 0.19474873542785645, "rewards/custom_reward_simplified_v7_dblog": 0.75625, "step": 480 }, { "completion_length": 680.94375, "epoch": 0.003902485644427808, "grad_norm": 0.15670013427734375, "kl": 0.004694941581692547, "learning_rate": 4.836547313372472e-06, "loss": 0.0002, "reward": 0.73125, "reward_std": 0.2675834000110626, "rewards/custom_reward_simplified_v7_dblog": 0.73125, "step": 490 }, { "completion_length": 699.1, "epoch": 0.003982128208599804, "grad_norm": 0.1365301012992859, "kl": 0.00405421577161178, "learning_rate": 4.823368810567056e-06, "loss": 0.0002, "reward": 0.603125, "reward_std": 0.25718758851289747, "rewards/custom_reward_simplified_v7_dblog": 0.603125, "step": 500 }, { "completion_length": 646.7, "epoch": 0.0040617707727718, "grad_norm": 0.14925876259803772, "kl": 0.003934591950383037, "learning_rate": 4.809698831278217e-06, "loss": 0.0002, "reward": 0.734375, "reward_std": 0.2696119427680969, "rewards/custom_reward_simplified_v7_dblog": 0.734375, "step": 510 }, { "completion_length": 726.08125, "epoch": 0.004141413336943796, "grad_norm": 0.2107785940170288, "kl": 0.004233359964564443, "learning_rate": 4.7955402672006855e-06, "loss": 0.0002, "reward": 0.759375, "reward_std": 0.2953102938830853, "rewards/custom_reward_simplified_v7_dblog": 0.759375, "step": 520 }, { "completion_length": 633.525, "epoch": 0.004221055901115793, "grad_norm": 0.2159271538257599, "kl": 0.004929024970624596, "learning_rate": 4.780896113382536e-06, "loss": 0.0002, "reward": 0.75625, "reward_std": 0.2647860750555992, "rewards/custom_reward_simplified_v7_dblog": 0.75625, "step": 530 }, { "completion_length": 586.9125, "epoch": 0.004300698465287789, "grad_norm": 0.2394983470439911, "kl": 0.004724201350472868, "learning_rate": 4.765769467591626e-06, "loss": 0.0002, "reward": 0.975, "reward_std": 0.36022927314043046, "rewards/custom_reward_simplified_v7_dblog": 0.975, "step": 540 }, { "completion_length": 651.88125, "epoch": 0.004380341029459785, "grad_norm": 0.1552504301071167, "kl": 0.004269527771975845, "learning_rate": 4.750163529660303e-06, "loss": 0.0002, "reward": 0.790625, "reward_std": 0.2759058982133865, "rewards/custom_reward_simplified_v7_dblog": 0.790625, "step": 550 }, { "completion_length": 655.9125, "epoch": 0.004459983593631781, "grad_norm": 0.13005749881267548, "kl": 0.004541868972592056, "learning_rate": 4.734081600808531e-06, "loss": 0.0002, "reward": 0.796875, "reward_std": 0.2369130529463291, "rewards/custom_reward_simplified_v7_dblog": 0.796875, "step": 560 }, { "completion_length": 630.3375, "epoch": 0.004539626157803777, "grad_norm": 0.14732114970684052, "kl": 0.004577037692070007, "learning_rate": 4.717527082945555e-06, "loss": 0.0002, "reward": 0.925, "reward_std": 0.3310479797422886, "rewards/custom_reward_simplified_v7_dblog": 0.925, "step": 570 }, { "completion_length": 693.2625, "epoch": 0.004619268721975773, "grad_norm": 0.11388376355171204, "kl": 0.004154781624674797, "learning_rate": 4.700503477950278e-06, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.29332098439335824, "rewards/custom_reward_simplified_v7_dblog": 0.6875, "step": 580 }, { "completion_length": 662.7625, "epoch": 0.004698911286147769, "grad_norm": 0.15470421314239502, "kl": 0.00541011628229171, "learning_rate": 4.6830143869304904e-06, "loss": 0.0002, "reward": 0.809375, "reward_std": 0.32753978818655016, "rewards/custom_reward_simplified_v7_dblog": 0.809375, "step": 590 }, { "completion_length": 698.95625, "epoch": 0.004778553850319765, "grad_norm": 0.004228990990668535, "kl": 0.004637495230417699, "learning_rate": 4.665063509461098e-06, "loss": 0.0002, "reward": 0.75, "reward_std": 0.23772156983613968, "rewards/custom_reward_simplified_v7_dblog": 0.75, "step": 600 }, { "completion_length": 629.7625, "epoch": 0.004858196414491761, "grad_norm": 0.21860064566135406, "kl": 0.0044788535917177795, "learning_rate": 4.646654642801533e-06, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.27716630697250366, "rewards/custom_reward_simplified_v7_dblog": 0.8125, "step": 610 }, { "completion_length": 727.06875, "epoch": 0.004937838978663757, "grad_norm": 0.1765265315771103, "kl": 0.004957099666353315, "learning_rate": 4.627791681092499e-06, "loss": 0.0002, "reward": 0.6, "reward_std": 0.2689620770514011, "rewards/custom_reward_simplified_v7_dblog": 0.6, "step": 620 }, { "completion_length": 718.825, "epoch": 0.005017481542835753, "grad_norm": 0.12771090865135193, "kl": 0.005165508517529815, "learning_rate": 4.608478614532215e-06, "loss": 0.0002, "reward": 0.728125, "reward_std": 0.3053886480629444, "rewards/custom_reward_simplified_v7_dblog": 0.728125, "step": 630 }, { "completion_length": 629.525, "epoch": 0.00509712410700775, "grad_norm": 0.17840693891048431, "kl": 0.005059469246771186, "learning_rate": 4.588719528532342e-06, "loss": 0.0002, "reward": 0.721875, "reward_std": 0.298052953928709, "rewards/custom_reward_simplified_v7_dblog": 0.721875, "step": 640 }, { "completion_length": 668.68125, "epoch": 0.005176766671179746, "grad_norm": 0.12746350467205048, "kl": 0.004331990797072649, "learning_rate": 4.568518602853776e-06, "loss": 0.0002, "reward": 0.746875, "reward_std": 0.22913563549518584, "rewards/custom_reward_simplified_v7_dblog": 0.746875, "step": 650 }, { "completion_length": 734.9875, "epoch": 0.005256409235351742, "grad_norm": 0.19717195630073547, "kl": 0.00479215239174664, "learning_rate": 4.54788011072248e-06, "loss": 0.0002, "reward": 0.784375, "reward_std": 0.4230809181928635, "rewards/custom_reward_simplified_v7_dblog": 0.784375, "step": 660 }, { "completion_length": 658.29375, "epoch": 0.005336051799523738, "grad_norm": 0.2698514759540558, "kl": 0.004821322776842862, "learning_rate": 4.526808417925531e-06, "loss": 0.0002, "reward": 0.81875, "reward_std": 0.26030006259679794, "rewards/custom_reward_simplified_v7_dblog": 0.81875, "step": 670 }, { "completion_length": 696.30625, "epoch": 0.005415694363695734, "grad_norm": 0.2144252061843872, "kl": 0.005292760988231749, "learning_rate": 4.50530798188761e-06, "loss": 0.0002, "reward": 0.609375, "reward_std": 0.2595392823219299, "rewards/custom_reward_simplified_v7_dblog": 0.609375, "step": 680 }, { "completion_length": 696.99375, "epoch": 0.00549533692786773, "grad_norm": 0.006262101698666811, "kl": 0.005413674132432789, "learning_rate": 4.4833833507280884e-06, "loss": 0.0002, "reward": 0.684375, "reward_std": 0.24843912497162818, "rewards/custom_reward_simplified_v7_dblog": 0.684375, "step": 690 }, { "completion_length": 675.50625, "epoch": 0.005574979492039726, "grad_norm": 0.16301825642585754, "kl": 0.005892223375849426, "learning_rate": 4.46103916229894e-06, "loss": 0.0002, "reward": 0.80625, "reward_std": 0.34091843143105505, "rewards/custom_reward_simplified_v7_dblog": 0.80625, "step": 700 }, { "completion_length": 725.675, "epoch": 0.005654622056211722, "grad_norm": 0.18473494052886963, "kl": 0.005652935197576881, "learning_rate": 4.438280143203665e-06, "loss": 0.0002, "reward": 0.66875, "reward_std": 0.216452856361866, "rewards/custom_reward_simplified_v7_dblog": 0.66875, "step": 710 }, { "completion_length": 764.26875, "epoch": 0.005734264620383718, "grad_norm": 0.17735017836093903, "kl": 0.005824547982774675, "learning_rate": 4.415111107797445e-06, "loss": 0.0002, "reward": 0.634375, "reward_std": 0.25477964654564855, "rewards/custom_reward_simplified_v7_dblog": 0.634375, "step": 720 }, { "completion_length": 607.86875, "epoch": 0.005813907184555714, "grad_norm": 0.20680995285511017, "kl": 0.0055589195340871814, "learning_rate": 4.391536957168733e-06, "loss": 0.0002, "reward": 0.8, "reward_std": 0.32480863481760025, "rewards/custom_reward_simplified_v7_dblog": 0.8, "step": 730 }, { "completion_length": 674.13125, "epoch": 0.00589354974872771, "grad_norm": 0.005594769027084112, "kl": 0.005972519854549318, "learning_rate": 4.367562678102491e-06, "loss": 0.0002, "reward": 0.665625, "reward_std": 0.20820673778653145, "rewards/custom_reward_simplified_v7_dblog": 0.665625, "step": 740 }, { "completion_length": 639.69375, "epoch": 0.005973192312899706, "grad_norm": 0.11012833565473557, "kl": 0.005814655229914934, "learning_rate": 4.34319334202531e-06, "loss": 0.0002, "reward": 0.796875, "reward_std": 0.34761993661522866, "rewards/custom_reward_simplified_v7_dblog": 0.796875, "step": 750 }, { "completion_length": 587.6, "epoch": 0.006052834877071703, "grad_norm": 0.2750849723815918, "kl": 0.006217251974157989, "learning_rate": 4.318434103932622e-06, "loss": 0.0002, "reward": 0.75625, "reward_std": 0.23903784826397895, "rewards/custom_reward_simplified_v7_dblog": 0.75625, "step": 760 }, { "completion_length": 691.45625, "epoch": 0.006132477441243699, "grad_norm": 0.12792551517486572, "kl": 0.005762395297642798, "learning_rate": 4.293290201298224e-06, "loss": 0.0002, "reward": 0.65, "reward_std": 0.282283828407526, "rewards/custom_reward_simplified_v7_dblog": 0.65, "step": 770 }, { "completion_length": 634.79375, "epoch": 0.006212120005415695, "grad_norm": 0.11762549728155136, "kl": 0.005472023575566709, "learning_rate": 4.267766952966369e-06, "loss": 0.0002, "reward": 0.878125, "reward_std": 0.31506996527314185, "rewards/custom_reward_simplified_v7_dblog": 0.878125, "step": 780 }, { "completion_length": 719.05625, "epoch": 0.006291762569587691, "grad_norm": 0.0052847606129944324, "kl": 0.006504135020077228, "learning_rate": 4.241869758026638e-06, "loss": 0.0003, "reward": 0.628125, "reward_std": 0.2685270056128502, "rewards/custom_reward_simplified_v7_dblog": 0.628125, "step": 790 }, { "completion_length": 699.19375, "epoch": 0.006371405133759687, "grad_norm": 0.2003583461046219, "kl": 0.005931918846908957, "learning_rate": 4.215604094671835e-06, "loss": 0.0002, "reward": 0.746875, "reward_std": 0.25832219421863556, "rewards/custom_reward_simplified_v7_dblog": 0.746875, "step": 800 }, { "completion_length": 652.925, "epoch": 0.006451047697931683, "grad_norm": 0.0062674470245838165, "kl": 0.006221415114123374, "learning_rate": 4.188975519039151e-06, "loss": 0.0002, "reward": 0.73125, "reward_std": 0.3172403134405613, "rewards/custom_reward_simplified_v7_dblog": 0.73125, "step": 810 }, { "completion_length": 668.63125, "epoch": 0.006530690262103679, "grad_norm": 0.13624051213264465, "kl": 0.0063671735813841225, "learning_rate": 4.161989664034844e-06, "loss": 0.0003, "reward": 0.684375, "reward_std": 0.24903304055333136, "rewards/custom_reward_simplified_v7_dblog": 0.684375, "step": 820 }, { "completion_length": 658.575, "epoch": 0.006610332826275675, "grad_norm": 0.2923766076564789, "kl": 0.0068331335205584764, "learning_rate": 4.134652238142674e-06, "loss": 0.0003, "reward": 0.73125, "reward_std": 0.3243869088590145, "rewards/custom_reward_simplified_v7_dblog": 0.73125, "step": 830 }, { "completion_length": 645.31875, "epoch": 0.006689975390447671, "grad_norm": 0.22414511442184448, "kl": 0.006329123536124826, "learning_rate": 4.106969024216348e-06, "loss": 0.0003, "reward": 0.728125, "reward_std": 0.2578707054257393, "rewards/custom_reward_simplified_v7_dblog": 0.728125, "step": 840 }, { "completion_length": 620.76875, "epoch": 0.006769617954619667, "grad_norm": 0.2500353455543518, "kl": 0.006427089823409915, "learning_rate": 4.078945878256244e-06, "loss": 0.0003, "reward": 0.85625, "reward_std": 0.3704014003276825, "rewards/custom_reward_simplified_v7_dblog": 0.85625, "step": 850 }, { "completion_length": 545.075, "epoch": 0.006849260518791663, "grad_norm": 0.18576188385486603, "kl": 0.005737546656746417, "learning_rate": 4.0505887281706505e-06, "loss": 0.0002, "reward": 0.9125, "reward_std": 0.27787805944681165, "rewards/custom_reward_simplified_v7_dblog": 0.9125, "step": 860 }, { "completion_length": 671.1375, "epoch": 0.0069289030829636595, "grad_norm": 0.27761420607566833, "kl": 0.005926149617880583, "learning_rate": 4.021903572521802e-06, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.1984293892979622, "rewards/custom_reward_simplified_v7_dblog": 0.71875, "step": 870 }, { "completion_length": 591.325, "epoch": 0.0070085456471356555, "grad_norm": 0.12898898124694824, "kl": 0.006013317289762199, "learning_rate": 3.992896479256966e-06, "loss": 0.0002, "reward": 0.875, "reward_std": 0.31373453289270403, "rewards/custom_reward_simplified_v7_dblog": 0.875, "step": 880 }, { "completion_length": 709.64375, "epoch": 0.0070881882113076515, "grad_norm": 0.1858564019203186, "kl": 0.006654553860425949, "learning_rate": 3.963573584424852e-06, "loss": 0.0003, "reward": 0.875, "reward_std": 0.40053595080971716, "rewards/custom_reward_simplified_v7_dblog": 0.875, "step": 890 }, { "completion_length": 693.86875, "epoch": 0.0071678307754796475, "grad_norm": 0.23618744313716888, "kl": 0.006588698271661997, "learning_rate": 3.933941090877615e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.22922600656747819, "rewards/custom_reward_simplified_v7_dblog": 0.6875, "step": 900 }, { "completion_length": 655.1, "epoch": 0.0072474733396516436, "grad_norm": 0.18607589602470398, "kl": 0.006554636568762362, "learning_rate": 3.9040052669587325e-06, "loss": 0.0003, "reward": 0.79375, "reward_std": 0.26788339093327523, "rewards/custom_reward_simplified_v7_dblog": 0.79375, "step": 910 }, { "completion_length": 678.36875, "epoch": 0.0073271159038236396, "grad_norm": 0.15605397522449493, "kl": 0.006827571708709001, "learning_rate": 3.8737724451770155e-06, "loss": 0.0003, "reward": 0.74375, "reward_std": 0.25242582634091376, "rewards/custom_reward_simplified_v7_dblog": 0.74375, "step": 920 }, { "completion_length": 640.1875, "epoch": 0.0074067584679956356, "grad_norm": 0.22241215407848358, "kl": 0.006700195767916739, "learning_rate": 3.8432490208670605e-06, "loss": 0.0003, "reward": 0.753125, "reward_std": 0.30004683434963225, "rewards/custom_reward_simplified_v7_dblog": 0.753125, "step": 930 }, { "completion_length": 671.025, "epoch": 0.007486401032167632, "grad_norm": 0.2610742747783661, "kl": 0.007203501905314625, "learning_rate": 3.8124414508364005e-06, "loss": 0.0003, "reward": 0.696875, "reward_std": 0.2809624969959259, "rewards/custom_reward_simplified_v7_dblog": 0.696875, "step": 940 }, { "completion_length": 644.56875, "epoch": 0.007566043596339628, "grad_norm": 0.18431080877780914, "kl": 0.006376700336113572, "learning_rate": 3.7813562519996633e-06, "loss": 0.0003, "reward": 0.775, "reward_std": 0.2690692335367203, "rewards/custom_reward_simplified_v7_dblog": 0.775, "step": 950 }, { "completion_length": 706.7125, "epoch": 0.007645686160511624, "grad_norm": 0.11362796276807785, "kl": 0.0065676989033818245, "learning_rate": 3.7500000000000005e-06, "loss": 0.0003, "reward": 0.753125, "reward_std": 0.3238763153553009, "rewards/custom_reward_simplified_v7_dblog": 0.753125, "step": 960 }, { "completion_length": 591.60625, "epoch": 0.00772532872468362, "grad_norm": 0.006601857952773571, "kl": 0.0061999865574762225, "learning_rate": 3.7183793278181063e-06, "loss": 0.0002, "reward": 0.978125, "reward_std": 0.32862835973501203, "rewards/custom_reward_simplified_v7_dblog": 0.978125, "step": 970 }, { "completion_length": 623.40625, "epoch": 0.007804971288855616, "grad_norm": 0.24265889823436737, "kl": 0.006443582929205149, "learning_rate": 3.6865009243691015e-06, "loss": 0.0003, "reward": 0.790625, "reward_std": 0.35499989837408064, "rewards/custom_reward_simplified_v7_dblog": 0.790625, "step": 980 }, { "completion_length": 677.65625, "epoch": 0.007884613853027612, "grad_norm": 0.23094038665294647, "kl": 0.006802499154582619, "learning_rate": 3.654371533087586e-06, "loss": 0.0003, "reward": 0.80625, "reward_std": 0.3126889310777187, "rewards/custom_reward_simplified_v7_dblog": 0.80625, "step": 990 }, { "completion_length": 703.0125, "epoch": 0.007964256417199608, "grad_norm": 0.2269383817911148, "kl": 0.006587388808839023, "learning_rate": 3.621997950501156e-06, "loss": 0.0003, "reward": 0.83125, "reward_std": 0.3684743233025074, "rewards/custom_reward_simplified_v7_dblog": 0.83125, "step": 1000 }, { "completion_length": 702.75, "epoch": 0.008043898981371604, "grad_norm": 0.25571930408477783, "kl": 0.0066094894893467425, "learning_rate": 3.5893870247926986e-06, "loss": 0.0003, "reward": 0.690625, "reward_std": 0.27608626931905744, "rewards/custom_reward_simplified_v7_dblog": 0.690625, "step": 1010 }, { "completion_length": 634.18125, "epoch": 0.0081235415455436, "grad_norm": 0.006109423469752073, "kl": 0.006831615581177175, "learning_rate": 3.556545654351749e-06, "loss": 0.0003, "reward": 0.85625, "reward_std": 0.2714505262672901, "rewards/custom_reward_simplified_v7_dblog": 0.85625, "step": 1020 }, { "completion_length": 768.13125, "epoch": 0.008203184109715597, "grad_norm": 0.20112627744674683, "kl": 0.006995444605126977, "learning_rate": 3.5234807863152316e-06, "loss": 0.0003, "reward": 0.609375, "reward_std": 0.2496856138110161, "rewards/custom_reward_simplified_v7_dblog": 0.609375, "step": 1030 }, { "completion_length": 777.525, "epoch": 0.008282826673887592, "grad_norm": 0.2836349606513977, "kl": 0.007392951846122741, "learning_rate": 3.4901994150978926e-06, "loss": 0.0003, "reward": 0.675, "reward_std": 0.26406350955367086, "rewards/custom_reward_simplified_v7_dblog": 0.675, "step": 1040 }, { "completion_length": 719.2875, "epoch": 0.008362469238059589, "grad_norm": 0.1799333542585373, "kl": 0.007057315914425999, "learning_rate": 3.4567085809127247e-06, "loss": 0.0003, "reward": 0.790625, "reward_std": 0.33950999528169634, "rewards/custom_reward_simplified_v7_dblog": 0.790625, "step": 1050 }, { "completion_length": 621.4875, "epoch": 0.008442111802231585, "grad_norm": 0.25109627842903137, "kl": 0.006540448497980833, "learning_rate": 3.4230153682817112e-06, "loss": 0.0003, "reward": 0.85, "reward_std": 0.30627945214509966, "rewards/custom_reward_simplified_v7_dblog": 0.85, "step": 1060 }, { "completion_length": 671.04375, "epoch": 0.00852175436640358, "grad_norm": 0.1299162656068802, "kl": 0.006574284215457737, "learning_rate": 3.389126904537192e-06, "loss": 0.0003, "reward": 0.865625, "reward_std": 0.37070034593343737, "rewards/custom_reward_simplified_v7_dblog": 0.865625, "step": 1070 }, { "completion_length": 638.23125, "epoch": 0.008601396930575577, "grad_norm": 0.23796696960926056, "kl": 0.0075248789740726355, "learning_rate": 3.3550503583141726e-06, "loss": 0.0003, "reward": 0.746875, "reward_std": 0.25020881071686746, "rewards/custom_reward_simplified_v7_dblog": 0.746875, "step": 1080 }, { "completion_length": 634.9375, "epoch": 0.008681039494747573, "grad_norm": 0.2958204448223114, "kl": 0.006533738202415406, "learning_rate": 3.3207929380339034e-06, "loss": 0.0003, "reward": 0.896875, "reward_std": 0.38549663573503495, "rewards/custom_reward_simplified_v7_dblog": 0.896875, "step": 1090 }, { "completion_length": 661.2625, "epoch": 0.00876068205891957, "grad_norm": 0.007367302197962999, "kl": 0.007355101336725056, "learning_rate": 3.2863618903790346e-06, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.25932966247200967, "rewards/custom_reward_simplified_v7_dblog": 0.71875, "step": 1100 }, { "completion_length": 700.48125, "epoch": 0.008840324623091565, "grad_norm": 0.28138336539268494, "kl": 0.007267917576245964, "learning_rate": 3.2517644987606827e-06, "loss": 0.0003, "reward": 0.9125, "reward_std": 0.33715927675366403, "rewards/custom_reward_simplified_v7_dblog": 0.9125, "step": 1110 }, { "completion_length": 662.26875, "epoch": 0.008919967187263561, "grad_norm": 0.1348627209663391, "kl": 0.007481782068498433, "learning_rate": 3.217008081777726e-06, "loss": 0.0003, "reward": 0.728125, "reward_std": 0.2547163799405098, "rewards/custom_reward_simplified_v7_dblog": 0.728125, "step": 1120 }, { "completion_length": 733.2125, "epoch": 0.008999609751435557, "grad_norm": 0.2320898026227951, "kl": 0.007608366897329688, "learning_rate": 3.182099991668653e-06, "loss": 0.0003, "reward": 0.60625, "reward_std": 0.2975068032741547, "rewards/custom_reward_simplified_v7_dblog": 0.60625, "step": 1130 }, { "completion_length": 603.5, "epoch": 0.009079252315607553, "grad_norm": 0.23401154577732086, "kl": 0.007222792156971991, "learning_rate": 3.147047612756302e-06, "loss": 0.0003, "reward": 0.875, "reward_std": 0.2553515017032623, "rewards/custom_reward_simplified_v7_dblog": 0.875, "step": 1140 }, { "completion_length": 704.44375, "epoch": 0.009158894879779549, "grad_norm": 0.2538968324661255, "kl": 0.007968966104090213, "learning_rate": 3.1118583598858097e-06, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.29204289317131044, "rewards/custom_reward_simplified_v7_dblog": 0.6875, "step": 1150 }, { "completion_length": 641.88125, "epoch": 0.009238537443951545, "grad_norm": 0.007003675680607557, "kl": 0.007272082474082708, "learning_rate": 3.0765396768561005e-06, "loss": 0.0003, "reward": 0.875, "reward_std": 0.2666669487953186, "rewards/custom_reward_simplified_v7_dblog": 0.875, "step": 1160 }, { "completion_length": 645.55625, "epoch": 0.009318180008123542, "grad_norm": 0.005993107333779335, "kl": 0.00769920782186091, "learning_rate": 3.0410990348452572e-06, "loss": 0.0003, "reward": 0.846875, "reward_std": 0.29315834268927576, "rewards/custom_reward_simplified_v7_dblog": 0.846875, "step": 1170 }, { "completion_length": 690.65625, "epoch": 0.009397822572295537, "grad_norm": 0.196693554520607, "kl": 0.007807633420452475, "learning_rate": 3.0055439308300954e-06, "loss": 0.0003, "reward": 0.80625, "reward_std": 0.34684801325201986, "rewards/custom_reward_simplified_v7_dblog": 0.80625, "step": 1180 }, { "completion_length": 652.125, "epoch": 0.009477465136467534, "grad_norm": 0.009493391960859299, "kl": 0.008702660608105362, "learning_rate": 2.96988188600028e-06, "loss": 0.0003, "reward": 0.85625, "reward_std": 0.21074047386646272, "rewards/custom_reward_simplified_v7_dblog": 0.85625, "step": 1190 }, { "completion_length": 660.8125, "epoch": 0.00955710770063953, "grad_norm": 0.250519335269928, "kl": 0.008729650382883846, "learning_rate": 2.9341204441673267e-06, "loss": 0.0003, "reward": 0.728125, "reward_std": 0.33152099549770353, "rewards/custom_reward_simplified_v7_dblog": 0.728125, "step": 1200 }, { "completion_length": 660.7375, "epoch": 0.009636750264811526, "grad_norm": 0.20679971575737, "kl": 0.00826664932537824, "learning_rate": 2.898267170168807e-06, "loss": 0.0003, "reward": 0.665625, "reward_std": 0.25403511226177217, "rewards/custom_reward_simplified_v7_dblog": 0.665625, "step": 1210 }, { "completion_length": 653.59375, "epoch": 0.009716392828983521, "grad_norm": 0.14609546959400177, "kl": 0.007603704649955034, "learning_rate": 2.862329648268117e-06, "loss": 0.0003, "reward": 0.94375, "reward_std": 0.26154626756906507, "rewards/custom_reward_simplified_v7_dblog": 0.94375, "step": 1220 }, { "completion_length": 635.0125, "epoch": 0.009796035393155518, "grad_norm": 0.14301441609859467, "kl": 0.008189951698295773, "learning_rate": 2.82631548055013e-06, "loss": 0.0003, "reward": 0.9, "reward_std": 0.2126667931675911, "rewards/custom_reward_simplified_v7_dblog": 0.9, "step": 1230 }, { "completion_length": 816.5375, "epoch": 0.009875677957327514, "grad_norm": 0.1681988686323166, "kl": 0.01006167777813971, "learning_rate": 2.7902322853130758e-06, "loss": 0.0004, "reward": 0.51875, "reward_std": 0.27570038065314295, "rewards/custom_reward_simplified_v7_dblog": 0.51875, "step": 1240 }, { "completion_length": 710.75, "epoch": 0.00995532052149951, "grad_norm": 0.09834864735603333, "kl": 0.010588118969462813, "learning_rate": 2.754087695457005e-06, "loss": 0.0004, "reward": 0.6625, "reward_std": 0.19232839569449425, "rewards/custom_reward_simplified_v7_dblog": 0.6625, "step": 1250 }, { "completion_length": 615.5875, "epoch": 0.010034963085671506, "grad_norm": 0.14006367325782776, "kl": 0.008278649020940065, "learning_rate": 2.717889356869146e-06, "loss": 0.0003, "reward": 0.903125, "reward_std": 0.3407335430383682, "rewards/custom_reward_simplified_v7_dblog": 0.903125, "step": 1260 }, { "completion_length": 727.70625, "epoch": 0.010114605649843502, "grad_norm": 0.005724642425775528, "kl": 0.009203878976404668, "learning_rate": 2.681644926806527e-06, "loss": 0.0004, "reward": 0.60625, "reward_std": 0.2156815566122532, "rewards/custom_reward_simplified_v7_dblog": 0.60625, "step": 1270 }, { "completion_length": 641.9125, "epoch": 0.0101942482140155, "grad_norm": 0.21494239568710327, "kl": 0.008675340004265309, "learning_rate": 2.6453620722761897e-06, "loss": 0.0003, "reward": 0.81875, "reward_std": 0.22831376343965532, "rewards/custom_reward_simplified_v7_dblog": 0.81875, "step": 1280 }, { "completion_length": 650.5, "epoch": 0.010273890778187494, "grad_norm": 0.22972695529460907, "kl": 0.008116158202756196, "learning_rate": 2.6090484684133406e-06, "loss": 0.0003, "reward": 0.921875, "reward_std": 0.2564812809228897, "rewards/custom_reward_simplified_v7_dblog": 0.921875, "step": 1290 }, { "completion_length": 657.94375, "epoch": 0.010353533342359491, "grad_norm": 0.15338486433029175, "kl": 0.009256175020709634, "learning_rate": 2.572711796857779e-06, "loss": 0.0004, "reward": 0.709375, "reward_std": 0.21537503451108933, "rewards/custom_reward_simplified_v7_dblog": 0.709375, "step": 1300 }, { "completion_length": 650.58125, "epoch": 0.010433175906531486, "grad_norm": 0.14920295774936676, "kl": 0.009564152918756008, "learning_rate": 2.5363597441289574e-06, "loss": 0.0004, "reward": 0.828125, "reward_std": 0.2882704295217991, "rewards/custom_reward_simplified_v7_dblog": 0.828125, "step": 1310 }, { "completion_length": 723.89375, "epoch": 0.010512818470703483, "grad_norm": 0.20945711433887482, "kl": 0.010788540355861187, "learning_rate": 2.5e-06, "loss": 0.0004, "reward": 0.7125, "reward_std": 0.26380954012274743, "rewards/custom_reward_simplified_v7_dblog": 0.7125, "step": 1320 }, { "completion_length": 715.7, "epoch": 0.010592461034875478, "grad_norm": 0.16817767918109894, "kl": 0.013910629483871163, "learning_rate": 2.4636402558710434e-06, "loss": 0.0006, "reward": 0.759375, "reward_std": 0.2193169414997101, "rewards/custom_reward_simplified_v7_dblog": 0.759375, "step": 1330 }, { "completion_length": 655.90625, "epoch": 0.010672103599047475, "grad_norm": 0.2265154868364334, "kl": 0.00848452327772975, "learning_rate": 2.4272882031422216e-06, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.3443989932537079, "rewards/custom_reward_simplified_v7_dblog": 0.78125, "step": 1340 }, { "completion_length": 660.075, "epoch": 0.01075174616321947, "grad_norm": 0.24644052982330322, "kl": 0.009867909434251487, "learning_rate": 2.3909515315866606e-06, "loss": 0.0004, "reward": 0.79375, "reward_std": 0.29604131579399107, "rewards/custom_reward_simplified_v7_dblog": 0.79375, "step": 1350 }, { "completion_length": 633.9125, "epoch": 0.010831388727391467, "grad_norm": 0.1637645810842514, "kl": 0.00936238830909133, "learning_rate": 2.3546379277238107e-06, "loss": 0.0004, "reward": 0.90625, "reward_std": 0.324691192060709, "rewards/custom_reward_simplified_v7_dblog": 0.90625, "step": 1360 }, { "completion_length": 674.03125, "epoch": 0.010911031291563462, "grad_norm": 0.2471015304327011, "kl": 0.010726678185164928, "learning_rate": 2.318355073193474e-06, "loss": 0.0004, "reward": 0.65625, "reward_std": 0.21728940233588218, "rewards/custom_reward_simplified_v7_dblog": 0.65625, "step": 1370 }, { "completion_length": 682.31875, "epoch": 0.01099067385573546, "grad_norm": 0.10079372674226761, "kl": 0.009952771244570613, "learning_rate": 2.2821106431308546e-06, "loss": 0.0004, "reward": 0.89375, "reward_std": 0.33092204555869104, "rewards/custom_reward_simplified_v7_dblog": 0.89375, "step": 1380 }, { "completion_length": 669.09375, "epoch": 0.011070316419907454, "grad_norm": 0.19604210555553436, "kl": 0.011396997445262968, "learning_rate": 2.2459123045429953e-06, "loss": 0.0005, "reward": 0.784375, "reward_std": 0.29770964160561564, "rewards/custom_reward_simplified_v7_dblog": 0.784375, "step": 1390 }, { "completion_length": 651.80625, "epoch": 0.011149958984079451, "grad_norm": 0.27397212386131287, "kl": 0.01038803206756711, "learning_rate": 2.2097677146869242e-06, "loss": 0.0004, "reward": 0.878125, "reward_std": 0.27883157432079314, "rewards/custom_reward_simplified_v7_dblog": 0.878125, "step": 1400 }, { "completion_length": 687.5125, "epoch": 0.011229601548251448, "grad_norm": 0.22397036850452423, "kl": 0.012094876240007579, "learning_rate": 2.173684519449872e-06, "loss": 0.0005, "reward": 0.834375, "reward_std": 0.28866922557353974, "rewards/custom_reward_simplified_v7_dblog": 0.834375, "step": 1410 }, { "completion_length": 661.26875, "epoch": 0.011309244112423443, "grad_norm": 0.2519758939743042, "kl": 0.011373027227818966, "learning_rate": 2.1376703517318835e-06, "loss": 0.0005, "reward": 0.853125, "reward_std": 0.32343359887599943, "rewards/custom_reward_simplified_v7_dblog": 0.853125, "step": 1420 }, { "completion_length": 677.73125, "epoch": 0.01138888667659544, "grad_norm": 0.2689824104309082, "kl": 0.011312256497330964, "learning_rate": 2.101732829831194e-06, "loss": 0.0005, "reward": 0.765625, "reward_std": 0.27808423787355424, "rewards/custom_reward_simplified_v7_dblog": 0.765625, "step": 1430 }, { "completion_length": 619.71875, "epoch": 0.011468529240767435, "grad_norm": 0.32441073656082153, "kl": 0.010685316193848849, "learning_rate": 2.0658795558326745e-06, "loss": 0.0004, "reward": 0.871875, "reward_std": 0.2622031569480896, "rewards/custom_reward_simplified_v7_dblog": 0.871875, "step": 1440 }, { "completion_length": 613.3875, "epoch": 0.011548171804939432, "grad_norm": 0.15561087429523468, "kl": 0.012302201450802385, "learning_rate": 2.0301181139997206e-06, "loss": 0.0005, "reward": 0.8125, "reward_std": 0.26520399302244185, "rewards/custom_reward_simplified_v7_dblog": 0.8125, "step": 1450 }, { "completion_length": 677.23125, "epoch": 0.011627814369111427, "grad_norm": 0.2590673267841339, "kl": 0.011339499452151357, "learning_rate": 1.994456069169906e-06, "loss": 0.0005, "reward": 0.64375, "reward_std": 0.23993425220251083, "rewards/custom_reward_simplified_v7_dblog": 0.64375, "step": 1460 }, { "completion_length": 702.1625, "epoch": 0.011707456933283424, "grad_norm": 0.012393876910209656, "kl": 0.012036008480936288, "learning_rate": 1.958900965154743e-06, "loss": 0.0005, "reward": 0.64375, "reward_std": 0.21832374781370162, "rewards/custom_reward_simplified_v7_dblog": 0.64375, "step": 1470 }, { "completion_length": 722.06875, "epoch": 0.01178709949745542, "grad_norm": 0.13200955092906952, "kl": 0.013854384049773216, "learning_rate": 1.9234603231439e-06, "loss": 0.0006, "reward": 0.790625, "reward_std": 0.2784456007182598, "rewards/custom_reward_simplified_v7_dblog": 0.790625, "step": 1480 }, { "completion_length": 664.46875, "epoch": 0.011866742061627416, "grad_norm": 0.14230677485466003, "kl": 0.012553655169904232, "learning_rate": 1.8881416401141905e-06, "loss": 0.0005, "reward": 0.9, "reward_std": 0.23252918049693108, "rewards/custom_reward_simplified_v7_dblog": 0.9, "step": 1490 }, { "completion_length": 653.79375, "epoch": 0.011946384625799411, "grad_norm": 0.17014774680137634, "kl": 0.01346926314290613, "learning_rate": 1.852952387243698e-06, "loss": 0.0005, "reward": 0.740625, "reward_std": 0.22115055918693544, "rewards/custom_reward_simplified_v7_dblog": 0.740625, "step": 1500 }, { "completion_length": 640.39375, "epoch": 0.012026027189971408, "grad_norm": 0.17104946076869965, "kl": 0.013007838977500796, "learning_rate": 1.8179000083313483e-06, "loss": 0.0005, "reward": 0.9, "reward_std": 0.28725912123918534, "rewards/custom_reward_simplified_v7_dblog": 0.9, "step": 1510 }, { "completion_length": 650.7125, "epoch": 0.012105669754143405, "grad_norm": 0.1524449735879898, "kl": 0.012339419685304165, "learning_rate": 1.7829919182222752e-06, "loss": 0.0005, "reward": 0.790625, "reward_std": 0.3324665643274784, "rewards/custom_reward_simplified_v7_dblog": 0.790625, "step": 1520 }, { "completion_length": 674.3625, "epoch": 0.0121853123183154, "grad_norm": 0.2344941943883896, "kl": 0.012514100456610323, "learning_rate": 1.7482355012393177e-06, "loss": 0.0005, "reward": 0.859375, "reward_std": 0.3387090668082237, "rewards/custom_reward_simplified_v7_dblog": 0.859375, "step": 1530 }, { "completion_length": 718.6, "epoch": 0.012264954882487397, "grad_norm": 0.2631664276123047, "kl": 0.014576551388017833, "learning_rate": 1.7136381096209665e-06, "loss": 0.0006, "reward": 0.653125, "reward_std": 0.24619419425725936, "rewards/custom_reward_simplified_v7_dblog": 0.653125, "step": 1540 }, { "completion_length": 706.28125, "epoch": 0.012344597446659392, "grad_norm": 0.20134921371936798, "kl": 0.012202254333533346, "learning_rate": 1.6792070619660977e-06, "loss": 0.0005, "reward": 0.84375, "reward_std": 0.3321776181459427, "rewards/custom_reward_simplified_v7_dblog": 0.84375, "step": 1550 }, { "completion_length": 645.28125, "epoch": 0.01242424001083139, "grad_norm": 0.1851159930229187, "kl": 0.014482964109629393, "learning_rate": 1.6449496416858285e-06, "loss": 0.0006, "reward": 0.85625, "reward_std": 0.20507382601499557, "rewards/custom_reward_simplified_v7_dblog": 0.85625, "step": 1560 }, { "completion_length": 614.08125, "epoch": 0.012503882575003384, "grad_norm": 0.27418458461761475, "kl": 0.013118641986511647, "learning_rate": 1.6108730954628093e-06, "loss": 0.0005, "reward": 0.79375, "reward_std": 0.2820776253938675, "rewards/custom_reward_simplified_v7_dblog": 0.79375, "step": 1570 }, { "completion_length": 695.91875, "epoch": 0.012583525139175381, "grad_norm": 0.2425900250673294, "kl": 0.013333506928756834, "learning_rate": 1.5769846317182894e-06, "loss": 0.0005, "reward": 0.7625, "reward_std": 0.2879462748765945, "rewards/custom_reward_simplified_v7_dblog": 0.7625, "step": 1580 }, { "completion_length": 673.99375, "epoch": 0.012663167703347376, "grad_norm": 0.2331763356924057, "kl": 0.013234515953809024, "learning_rate": 1.5432914190872757e-06, "loss": 0.0005, "reward": 0.775, "reward_std": 0.2913659870624542, "rewards/custom_reward_simplified_v7_dblog": 0.775, "step": 1590 }, { "completion_length": 678.54375, "epoch": 0.012742810267519373, "grad_norm": 0.16657988727092743, "kl": 0.012798944069072605, "learning_rate": 1.509800584902108e-06, "loss": 0.0005, "reward": 0.759375, "reward_std": 0.2901748239994049, "rewards/custom_reward_simplified_v7_dblog": 0.759375, "step": 1600 }, { "completion_length": 652.49375, "epoch": 0.012822452831691368, "grad_norm": 0.12168209999799728, "kl": 0.012750855972990393, "learning_rate": 1.4765192136847686e-06, "loss": 0.0005, "reward": 0.728125, "reward_std": 0.26915703564882276, "rewards/custom_reward_simplified_v7_dblog": 0.728125, "step": 1610 }, { "completion_length": 660.95625, "epoch": 0.012902095395863365, "grad_norm": 0.13546766340732574, "kl": 0.013546877074986696, "learning_rate": 1.443454345648252e-06, "loss": 0.0005, "reward": 0.790625, "reward_std": 0.1937400370836258, "rewards/custom_reward_simplified_v7_dblog": 0.790625, "step": 1620 }, { "completion_length": 638.00625, "epoch": 0.012981737960035362, "grad_norm": 0.17955924570560455, "kl": 0.012779112858697771, "learning_rate": 1.4106129752073023e-06, "loss": 0.0005, "reward": 0.790625, "reward_std": 0.2674853280186653, "rewards/custom_reward_simplified_v7_dblog": 0.790625, "step": 1630 }, { "completion_length": 678.1125, "epoch": 0.013061380524207357, "grad_norm": 0.2616170644760132, "kl": 0.01720189054030925, "learning_rate": 1.3780020494988447e-06, "loss": 0.0007, "reward": 0.771875, "reward_std": 0.27255760729312895, "rewards/custom_reward_simplified_v7_dblog": 0.771875, "step": 1640 }, { "completion_length": 639.43125, "epoch": 0.013141023088379354, "grad_norm": 0.1487816423177719, "kl": 0.014415727299638092, "learning_rate": 1.3456284669124159e-06, "loss": 0.0006, "reward": 0.73125, "reward_std": 0.24298151433467866, "rewards/custom_reward_simplified_v7_dblog": 0.73125, "step": 1650 }, { "completion_length": 727.9625, "epoch": 0.01322066565255135, "grad_norm": 0.14750860631465912, "kl": 0.018067248188890515, "learning_rate": 1.313499075630899e-06, "loss": 0.0007, "reward": 0.721875, "reward_std": 0.30838647186756135, "rewards/custom_reward_simplified_v7_dblog": 0.721875, "step": 1660 }, { "completion_length": 780.08125, "epoch": 0.013300308216723346, "grad_norm": 0.2386309951543808, "kl": 0.017110610962845385, "learning_rate": 1.2816206721818944e-06, "loss": 0.0007, "reward": 0.6375, "reward_std": 0.26727318242192266, "rewards/custom_reward_simplified_v7_dblog": 0.6375, "step": 1670 }, { "completion_length": 655.70625, "epoch": 0.013379950780895341, "grad_norm": 0.2751936614513397, "kl": 0.01622524333652109, "learning_rate": 1.2500000000000007e-06, "loss": 0.0006, "reward": 0.878125, "reward_std": 0.284642493724823, "rewards/custom_reward_simplified_v7_dblog": 0.878125, "step": 1680 }, { "completion_length": 684.98125, "epoch": 0.013459593345067338, "grad_norm": 0.23118546605110168, "kl": 0.01642036633566022, "learning_rate": 1.218643748000337e-06, "loss": 0.0007, "reward": 0.85625, "reward_std": 0.339317075163126, "rewards/custom_reward_simplified_v7_dblog": 0.85625, "step": 1690 }, { "completion_length": 743.51875, "epoch": 0.013539235909239333, "grad_norm": 0.22867274284362793, "kl": 0.01721250016707927, "learning_rate": 1.1875585491636e-06, "loss": 0.0007, "reward": 0.653125, "reward_std": 0.277196903526783, "rewards/custom_reward_simplified_v7_dblog": 0.653125, "step": 1700 }, { "completion_length": 637.9625, "epoch": 0.01361887847341133, "grad_norm": 0.2428259700536728, "kl": 0.014563425956293941, "learning_rate": 1.1567509791329402e-06, "loss": 0.0006, "reward": 0.865625, "reward_std": 0.23967689424753189, "rewards/custom_reward_simplified_v7_dblog": 0.865625, "step": 1710 }, { "completion_length": 722.925, "epoch": 0.013698521037583325, "grad_norm": 0.21737752854824066, "kl": 0.014987437543459237, "learning_rate": 1.1262275548229852e-06, "loss": 0.0006, "reward": 0.725, "reward_std": 0.26179009675979614, "rewards/custom_reward_simplified_v7_dblog": 0.725, "step": 1720 }, { "completion_length": 633.31875, "epoch": 0.013778163601755322, "grad_norm": 0.22654354572296143, "kl": 0.013244283269159496, "learning_rate": 1.0959947330412681e-06, "loss": 0.0005, "reward": 0.921875, "reward_std": 0.2066536843776703, "rewards/custom_reward_simplified_v7_dblog": 0.921875, "step": 1730 }, { "completion_length": 615.29375, "epoch": 0.013857806165927319, "grad_norm": 0.22673261165618896, "kl": 0.014753601653501392, "learning_rate": 1.0660589091223854e-06, "loss": 0.0006, "reward": 0.815625, "reward_std": 0.30853241235017775, "rewards/custom_reward_simplified_v7_dblog": 0.815625, "step": 1740 }, { "completion_length": 630.3625, "epoch": 0.013937448730099314, "grad_norm": 0.012196751311421394, "kl": 0.01440229129511863, "learning_rate": 1.0364264155751489e-06, "loss": 0.0006, "reward": 0.915625, "reward_std": 0.23927971720695496, "rewards/custom_reward_simplified_v7_dblog": 0.915625, "step": 1750 }, { "completion_length": 715.2125, "epoch": 0.014017091294271311, "grad_norm": 0.2587921619415283, "kl": 0.017100332980044188, "learning_rate": 1.0071035207430352e-06, "loss": 0.0007, "reward": 0.74375, "reward_std": 0.2990465022623539, "rewards/custom_reward_simplified_v7_dblog": 0.74375, "step": 1760 }, { "completion_length": 682.74375, "epoch": 0.014096733858443306, "grad_norm": 0.24313370883464813, "kl": 0.01778110705781728, "learning_rate": 9.780964274781984e-07, "loss": 0.0007, "reward": 0.68125, "reward_std": 0.2005969136953354, "rewards/custom_reward_simplified_v7_dblog": 0.68125, "step": 1770 }, { "completion_length": 718.31875, "epoch": 0.014176376422615303, "grad_norm": 0.18841393291950226, "kl": 0.015946343122050167, "learning_rate": 9.494112718293503e-07, "loss": 0.0006, "reward": 0.771875, "reward_std": 0.27307887077331544, "rewards/custom_reward_simplified_v7_dblog": 0.771875, "step": 1780 }, { "completion_length": 707.1875, "epoch": 0.014256018986787298, "grad_norm": 0.2333621084690094, "kl": 0.01652351173106581, "learning_rate": 9.210541217437566e-07, "loss": 0.0007, "reward": 0.8125, "reward_std": 0.2497081995010376, "rewards/custom_reward_simplified_v7_dblog": 0.8125, "step": 1790 }, { "completion_length": 728.5375, "epoch": 0.014335661550959295, "grad_norm": 0.26783886551856995, "kl": 0.018553019547834993, "learning_rate": 8.930309757836517e-07, "loss": 0.0007, "reward": 0.75, "reward_std": 0.28967257887125014, "rewards/custom_reward_simplified_v7_dblog": 0.75, "step": 1800 }, { "completion_length": 689.26875, "epoch": 0.01441530411513129, "grad_norm": 0.17589329183101654, "kl": 0.016255489736795425, "learning_rate": 8.653477618573261e-07, "loss": 0.0007, "reward": 0.765625, "reward_std": 0.3363394603133202, "rewards/custom_reward_simplified_v7_dblog": 0.765625, "step": 1810 }, { "completion_length": 640.91875, "epoch": 0.014494946679303287, "grad_norm": 0.21075929701328278, "kl": 0.015922663966193795, "learning_rate": 8.380103359651554e-07, "loss": 0.0006, "reward": 0.925, "reward_std": 0.3459245666861534, "rewards/custom_reward_simplified_v7_dblog": 0.925, "step": 1820 }, { "completion_length": 708.60625, "epoch": 0.014574589243475282, "grad_norm": 0.00766308419406414, "kl": 0.01772608202882111, "learning_rate": 8.110244809608494e-07, "loss": 0.0007, "reward": 0.73125, "reward_std": 0.2913930006325245, "rewards/custom_reward_simplified_v7_dblog": 0.73125, "step": 1830 }, { "completion_length": 660.0375, "epoch": 0.014654231807647279, "grad_norm": 0.20974037051200867, "kl": 0.014227323909290135, "learning_rate": 7.843959053281663e-07, "loss": 0.0006, "reward": 0.809375, "reward_std": 0.24926668480038644, "rewards/custom_reward_simplified_v7_dblog": 0.809375, "step": 1840 }, { "completion_length": 729.71875, "epoch": 0.014733874371819274, "grad_norm": 0.24099427461624146, "kl": 0.018935651518404484, "learning_rate": 7.581302419733633e-07, "loss": 0.0008, "reward": 0.690625, "reward_std": 0.32810748890042307, "rewards/custom_reward_simplified_v7_dblog": 0.690625, "step": 1850 }, { "completion_length": 649.98125, "epoch": 0.014813516935991271, "grad_norm": 0.013280795887112617, "kl": 0.01633880774024874, "learning_rate": 7.322330470336314e-07, "loss": 0.0007, "reward": 0.91875, "reward_std": 0.24432293996214866, "rewards/custom_reward_simplified_v7_dblog": 0.91875, "step": 1860 }, { "completion_length": 669.09375, "epoch": 0.014893159500163268, "grad_norm": 0.2837064266204834, "kl": 0.014348302804864942, "learning_rate": 7.067097987017762e-07, "loss": 0.0006, "reward": 0.690625, "reward_std": 0.2307182878255844, "rewards/custom_reward_simplified_v7_dblog": 0.690625, "step": 1870 }, { "completion_length": 662.9625, "epoch": 0.014972802064335263, "grad_norm": 0.25689443945884705, "kl": 0.01656266492791474, "learning_rate": 6.815658960673782e-07, "loss": 0.0007, "reward": 0.85625, "reward_std": 0.22758262380957603, "rewards/custom_reward_simplified_v7_dblog": 0.85625, "step": 1880 }, { "completion_length": 719.24375, "epoch": 0.01505244462850726, "grad_norm": 0.22542421519756317, "kl": 0.01744127394631505, "learning_rate": 6.568066579746901e-07, "loss": 0.0007, "reward": 0.76875, "reward_std": 0.2790658660233021, "rewards/custom_reward_simplified_v7_dblog": 0.76875, "step": 1890 }, { "completion_length": 633.64375, "epoch": 0.015132087192679255, "grad_norm": 0.00903425831347704, "kl": 0.014375879801809788, "learning_rate": 6.324373218975105e-07, "loss": 0.0006, "reward": 0.725, "reward_std": 0.2382744610309601, "rewards/custom_reward_simplified_v7_dblog": 0.725, "step": 1900 }, { "completion_length": 767.7375, "epoch": 0.015211729756851252, "grad_norm": 0.1330222189426422, "kl": 0.02190765142440796, "learning_rate": 6.084630428312679e-07, "loss": 0.0009, "reward": 0.66875, "reward_std": 0.27546602860093117, "rewards/custom_reward_simplified_v7_dblog": 0.66875, "step": 1910 }, { "completion_length": 726.63125, "epoch": 0.015291372321023247, "grad_norm": 0.21655875444412231, "kl": 0.02581467442214489, "learning_rate": 5.848888922025553e-07, "loss": 0.001, "reward": 0.834375, "reward_std": 0.38373097851872445, "rewards/custom_reward_simplified_v7_dblog": 0.834375, "step": 1920 }, { "completion_length": 688.56875, "epoch": 0.015371014885195244, "grad_norm": 0.22155120968818665, "kl": 0.025313653564080597, "learning_rate": 5.617198567963353e-07, "loss": 0.001, "reward": 0.64375, "reward_std": 0.2539114162325859, "rewards/custom_reward_simplified_v7_dblog": 0.64375, "step": 1930 }, { "completion_length": 676.9125, "epoch": 0.01545065744936724, "grad_norm": 0.2373446673154831, "kl": 0.018907574540935456, "learning_rate": 5.389608377010608e-07, "loss": 0.0008, "reward": 0.821875, "reward_std": 0.1906539335846901, "rewards/custom_reward_simplified_v7_dblog": 0.821875, "step": 1940 }, { "completion_length": 640.675, "epoch": 0.015530300013539236, "grad_norm": 0.1865774542093277, "kl": 0.014899229886941612, "learning_rate": 5.166166492719124e-07, "loss": 0.0006, "reward": 0.725, "reward_std": 0.2747412838041782, "rewards/custom_reward_simplified_v7_dblog": 0.725, "step": 1950 }, { "completion_length": 651.70625, "epoch": 0.015609942577711231, "grad_norm": 0.2434624284505844, "kl": 0.01636054664850235, "learning_rate": 4.946920181123904e-07, "loss": 0.0007, "reward": 0.7625, "reward_std": 0.2852359592914581, "rewards/custom_reward_simplified_v7_dblog": 0.7625, "step": 1960 }, { "completion_length": 654.6625, "epoch": 0.015689585141883226, "grad_norm": 0.20749981701374054, "kl": 0.018196922447532415, "learning_rate": 4.7319158207446953e-07, "loss": 0.0007, "reward": 0.715625, "reward_std": 0.2198973834514618, "rewards/custom_reward_simplified_v7_dblog": 0.715625, "step": 1970 }, { "completion_length": 641.45, "epoch": 0.015769227706055225, "grad_norm": 0.23187489807605743, "kl": 0.017989515024237335, "learning_rate": 4.5211988927752026e-07, "loss": 0.0007, "reward": 0.7875, "reward_std": 0.24450960606336594, "rewards/custom_reward_simplified_v7_dblog": 0.7875, "step": 1980 }, { "completion_length": 643.6375, "epoch": 0.01584887027022722, "grad_norm": 0.235895574092865, "kl": 0.015841626143082977, "learning_rate": 4.3148139714622365e-07, "loss": 0.0006, "reward": 0.896875, "reward_std": 0.26189937368035315, "rewards/custom_reward_simplified_v7_dblog": 0.896875, "step": 1990 }, { "completion_length": 629.60625, "epoch": 0.015928512834399215, "grad_norm": 0.2776155471801758, "kl": 0.015184593386948109, "learning_rate": 4.1128047146765936e-07, "loss": 0.0006, "reward": 0.921875, "reward_std": 0.23378355875611306, "rewards/custom_reward_simplified_v7_dblog": 0.921875, "step": 2000 }, { "completion_length": 710.65, "epoch": 0.016008155398571214, "grad_norm": 0.13598495721817017, "kl": 0.01561300114262849, "learning_rate": 3.915213854677863e-07, "loss": 0.0006, "reward": 0.859375, "reward_std": 0.22324086129665374, "rewards/custom_reward_simplified_v7_dblog": 0.859375, "step": 2010 }, { "completion_length": 600.3625, "epoch": 0.01608779796274321, "grad_norm": 0.33102965354919434, "kl": 0.01562973433174193, "learning_rate": 3.722083189075007e-07, "loss": 0.0006, "reward": 1.0125, "reward_std": 0.37898894101381303, "rewards/custom_reward_simplified_v7_dblog": 1.0125, "step": 2020 }, { "completion_length": 633.40625, "epoch": 0.016167440526915204, "grad_norm": 0.009714637883007526, "kl": 0.01524353977292776, "learning_rate": 3.5334535719846767e-07, "loss": 0.0006, "reward": 0.775, "reward_std": 0.1905590772628784, "rewards/custom_reward_simplified_v7_dblog": 0.775, "step": 2030 }, { "completion_length": 674.3625, "epoch": 0.0162470830910872, "grad_norm": 0.2587895095348358, "kl": 0.015684280125424267, "learning_rate": 3.3493649053890325e-07, "loss": 0.0006, "reward": 0.978125, "reward_std": 0.33772673830389977, "rewards/custom_reward_simplified_v7_dblog": 0.978125, "step": 2040 }, { "completion_length": 623.1375, "epoch": 0.016326725655259198, "grad_norm": 0.24910244345664978, "kl": 0.014677197439596057, "learning_rate": 3.1698561306951065e-07, "loss": 0.0006, "reward": 0.925, "reward_std": 0.3512172996997833, "rewards/custom_reward_simplified_v7_dblog": 0.925, "step": 2050 }, { "completion_length": 678.0375, "epoch": 0.016406368219431193, "grad_norm": 0.20536966621875763, "kl": 0.017746813944540918, "learning_rate": 2.9949652204972257e-07, "loss": 0.0007, "reward": 0.828125, "reward_std": 0.34475562572479246, "rewards/custom_reward_simplified_v7_dblog": 0.828125, "step": 2060 }, { "completion_length": 634.36875, "epoch": 0.016486010783603188, "grad_norm": 0.26798176765441895, "kl": 0.017110086302272974, "learning_rate": 2.8247291705444575e-07, "loss": 0.0007, "reward": 0.89375, "reward_std": 0.24814453721046448, "rewards/custom_reward_simplified_v7_dblog": 0.89375, "step": 2070 }, { "completion_length": 710.6875, "epoch": 0.016565653347775183, "grad_norm": 0.20649504661560059, "kl": 0.018557686172425748, "learning_rate": 2.6591839919146963e-07, "loss": 0.0007, "reward": 0.828125, "reward_std": 0.34967463091015816, "rewards/custom_reward_simplified_v7_dblog": 0.828125, "step": 2080 }, { "completion_length": 642.375, "epoch": 0.016645295911947182, "grad_norm": 0.016043314710259438, "kl": 0.018814650364220142, "learning_rate": 2.4983647033969714e-07, "loss": 0.0008, "reward": 0.859375, "reward_std": 0.3110216066241264, "rewards/custom_reward_simplified_v7_dblog": 0.859375, "step": 2090 }, { "completion_length": 686.65625, "epoch": 0.016724938476119177, "grad_norm": 0.26343393325805664, "kl": 0.019906887435354292, "learning_rate": 2.3423053240837518e-07, "loss": 0.0008, "reward": 0.715625, "reward_std": 0.17099330350756645, "rewards/custom_reward_simplified_v7_dblog": 0.715625, "step": 2100 }, { "completion_length": 656.8, "epoch": 0.016804581040291172, "grad_norm": 0.01307599525898695, "kl": 0.020065448177047075, "learning_rate": 2.1910388661746495e-07, "loss": 0.0008, "reward": 0.8, "reward_std": 0.20212240219116212, "rewards/custom_reward_simplified_v7_dblog": 0.8, "step": 2110 }, { "completion_length": 714.25625, "epoch": 0.01688422360446317, "grad_norm": 0.2202935814857483, "kl": 0.02329984272364527, "learning_rate": 2.044597327993153e-07, "loss": 0.0009, "reward": 0.7875, "reward_std": 0.307485481351614, "rewards/custom_reward_simplified_v7_dblog": 0.7875, "step": 2120 }, { "completion_length": 685.39375, "epoch": 0.016963866168635166, "grad_norm": 0.30204537510871887, "kl": 0.018967814440838993, "learning_rate": 1.9030116872178317e-07, "loss": 0.0008, "reward": 0.803125, "reward_std": 0.3279333204030991, "rewards/custom_reward_simplified_v7_dblog": 0.803125, "step": 2130 }, { "completion_length": 674.49375, "epoch": 0.01704350873280716, "grad_norm": 0.012012571096420288, "kl": 0.02170075795147568, "learning_rate": 1.7663118943294367e-07, "loss": 0.0009, "reward": 0.703125, "reward_std": 0.2257047951221466, "rewards/custom_reward_simplified_v7_dblog": 0.703125, "step": 2140 }, { "completion_length": 694.63125, "epoch": 0.017123151296979156, "grad_norm": 0.01635037176311016, "kl": 0.02094450539443642, "learning_rate": 1.6345268662752904e-07, "loss": 0.0008, "reward": 0.7125, "reward_std": 0.2917635254561901, "rewards/custom_reward_simplified_v7_dblog": 0.7125, "step": 2150 }, { "completion_length": 702.025, "epoch": 0.017202793861151155, "grad_norm": 0.008707295171916485, "kl": 0.01914967515040189, "learning_rate": 1.507684480352292e-07, "loss": 0.0008, "reward": 0.821875, "reward_std": 0.2691307656466961, "rewards/custom_reward_simplified_v7_dblog": 0.821875, "step": 2160 }, { "completion_length": 704.90625, "epoch": 0.01728243642532315, "grad_norm": 0.1347748190164566, "kl": 0.017809830722399056, "learning_rate": 1.3858115683098832e-07, "loss": 0.0007, "reward": 0.9, "reward_std": 0.30937733352184293, "rewards/custom_reward_simplified_v7_dblog": 0.9, "step": 2170 }, { "completion_length": 650.13125, "epoch": 0.017362078989495145, "grad_norm": 0.013826651498675346, "kl": 0.017964964429847897, "learning_rate": 1.2689339106741529e-07, "loss": 0.0007, "reward": 0.821875, "reward_std": 0.2382724992930889, "rewards/custom_reward_simplified_v7_dblog": 0.821875, "step": 2180 }, { "completion_length": 574.075, "epoch": 0.01744172155366714, "grad_norm": 0.21891085803508759, "kl": 0.013470867811702193, "learning_rate": 1.1570762312943295e-07, "loss": 0.0005, "reward": 0.9875, "reward_std": 0.2131643146276474, "rewards/custom_reward_simplified_v7_dblog": 0.9875, "step": 2190 }, { "completion_length": 645.95, "epoch": 0.01752136411783914, "grad_norm": 0.28153711557388306, "kl": 0.01899058516137302, "learning_rate": 1.0502621921127776e-07, "loss": 0.0008, "reward": 0.834375, "reward_std": 0.29732906967401507, "rewards/custom_reward_simplified_v7_dblog": 0.834375, "step": 2200 }, { "completion_length": 618.19375, "epoch": 0.017601006682011134, "grad_norm": 0.25354552268981934, "kl": 0.016854454204440115, "learning_rate": 9.485143881596715e-08, "loss": 0.0007, "reward": 0.85625, "reward_std": 0.25810291022062304, "rewards/custom_reward_simplified_v7_dblog": 0.85625, "step": 2210 }, { "completion_length": 638.425, "epoch": 0.01768064924618313, "grad_norm": 0.2272520810365677, "kl": 0.018312370544299482, "learning_rate": 8.518543427732951e-08, "loss": 0.0007, "reward": 0.753125, "reward_std": 0.2212974861264229, "rewards/custom_reward_simplified_v7_dblog": 0.753125, "step": 2220 }, { "completion_length": 695.54375, "epoch": 0.017760291810355128, "grad_norm": 0.27871131896972656, "kl": 0.02111934470012784, "learning_rate": 7.603025030471001e-08, "loss": 0.0008, "reward": 0.75, "reward_std": 0.2767858363687992, "rewards/custom_reward_simplified_v7_dblog": 0.75, "step": 2230 }, { "completion_length": 632.99375, "epoch": 0.017839934374527123, "grad_norm": 0.008834543637931347, "kl": 0.016428270121105017, "learning_rate": 6.738782355044048e-08, "loss": 0.0007, "reward": 0.80625, "reward_std": 0.21589626967906952, "rewards/custom_reward_simplified_v7_dblog": 0.80625, "step": 2240 }, { "completion_length": 634.0625, "epoch": 0.017919576938699118, "grad_norm": 0.286683052778244, "kl": 0.016679517249576746, "learning_rate": 5.92599822001666e-08, "loss": 0.0007, "reward": 0.853125, "reward_std": 0.2905955038964748, "rewards/custom_reward_simplified_v7_dblog": 0.853125, "step": 2250 }, { "completion_length": 610.85, "epoch": 0.017999219502871113, "grad_norm": 0.28028422594070435, "kl": 0.017966749798506498, "learning_rate": 5.164844558612131e-08, "loss": 0.0007, "reward": 0.971875, "reward_std": 0.3067967638373375, "rewards/custom_reward_simplified_v7_dblog": 0.971875, "step": 2260 }, { "completion_length": 566.9625, "epoch": 0.018078862067043112, "grad_norm": 0.3413483202457428, "kl": 0.01525729293935001, "learning_rate": 4.455482382342336e-08, "loss": 0.0006, "reward": 0.959375, "reward_std": 0.3084723956882954, "rewards/custom_reward_simplified_v7_dblog": 0.959375, "step": 2270 }, { "completion_length": 662.05625, "epoch": 0.018158504631215107, "grad_norm": 0.153013676404953, "kl": 0.017893880722112954, "learning_rate": 3.798061746947995e-08, "loss": 0.0007, "reward": 0.753125, "reward_std": 0.2308400221168995, "rewards/custom_reward_simplified_v7_dblog": 0.753125, "step": 2280 }, { "completion_length": 615.55, "epoch": 0.018238147195387102, "grad_norm": 0.2853679060935974, "kl": 0.0166370629100129, "learning_rate": 3.1927217206564884e-08, "loss": 0.0007, "reward": 0.74375, "reward_std": 0.25018117427825926, "rewards/custom_reward_simplified_v7_dblog": 0.74375, "step": 2290 }, { "completion_length": 710.45, "epoch": 0.018317789759559097, "grad_norm": 0.011245607398450375, "kl": 0.01835272475145757, "learning_rate": 2.6395903547638825e-08, "loss": 0.0007, "reward": 0.78125, "reward_std": 0.2881218962371349, "rewards/custom_reward_simplified_v7_dblog": 0.78125, "step": 2300 }, { "completion_length": 565.125, "epoch": 0.018397432323731096, "grad_norm": 0.25337040424346924, "kl": 0.01471406095661223, "learning_rate": 2.1387846565474047e-08, "loss": 0.0006, "reward": 1.078125, "reward_std": 0.4393742740154266, "rewards/custom_reward_simplified_v7_dblog": 1.078125, "step": 2310 }, { "completion_length": 692.70625, "epoch": 0.01847707488790309, "grad_norm": 0.20416221022605896, "kl": 0.02145941834896803, "learning_rate": 1.6904105645142443e-08, "loss": 0.0009, "reward": 0.5625, "reward_std": 0.1467035911977291, "rewards/custom_reward_simplified_v7_dblog": 0.5625, "step": 2320 } ], "logging_steps": 10, "max_steps": 2400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }