| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.01851689616998909, |
| "eval_steps": 500, |
| "global_step": 2325, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 714.1125, |
| "epoch": 7.964256417199609e-05, |
| "grad_norm": 0.11108597368001938, |
| "kl": 0.0005961419927189126, |
| "learning_rate": 2.0833333333333333e-07, |
| "loss": 0.0, |
| "reward": 0.540625, |
| "reward_std": 0.29713641852140427, |
| "rewards/custom_reward_simplified_v7_dblog": 0.540625, |
| "step": 10 |
| }, |
| { |
| "completion_length": 800.6625, |
| "epoch": 0.00015928512834399218, |
| "grad_norm": 0.1964382529258728, |
| "kl": 0.0007280149788130075, |
| "learning_rate": 4.1666666666666667e-07, |
| "loss": 0.0, |
| "reward": 0.496875, |
| "reward_std": 0.25719649270176886, |
| "rewards/custom_reward_simplified_v7_dblog": 0.496875, |
| "step": 20 |
| }, |
| { |
| "completion_length": 750.46875, |
| "epoch": 0.00023892769251598824, |
| "grad_norm": 0.15792745351791382, |
| "kl": 0.0007828957575839012, |
| "learning_rate": 6.25e-07, |
| "loss": 0.0, |
| "reward": 0.684375, |
| "reward_std": 0.3755971297621727, |
| "rewards/custom_reward_simplified_v7_dblog": 0.684375, |
| "step": 30 |
| }, |
| { |
| "completion_length": 813.94375, |
| "epoch": 0.00031857025668798435, |
| "grad_norm": 0.12503573298454285, |
| "kl": 0.0007155703555326909, |
| "learning_rate": 8.333333333333333e-07, |
| "loss": 0.0, |
| "reward": 0.565625, |
| "reward_std": 0.2761854581534863, |
| "rewards/custom_reward_simplified_v7_dblog": 0.565625, |
| "step": 40 |
| }, |
| { |
| "completion_length": 747.675, |
| "epoch": 0.0003982128208599804, |
| "grad_norm": 0.10329681634902954, |
| "kl": 0.0007686431898036971, |
| "learning_rate": 1.0416666666666667e-06, |
| "loss": 0.0, |
| "reward": 0.621875, |
| "reward_std": 0.30715219378471376, |
| "rewards/custom_reward_simplified_v7_dblog": 0.621875, |
| "step": 50 |
| }, |
| { |
| "completion_length": 821.60625, |
| "epoch": 0.0004778553850319765, |
| "grad_norm": 0.1834840029478073, |
| "kl": 0.0007538022648077459, |
| "learning_rate": 1.25e-06, |
| "loss": 0.0, |
| "reward": 0.578125, |
| "reward_std": 0.39505376294255257, |
| "rewards/custom_reward_simplified_v7_dblog": 0.578125, |
| "step": 60 |
| }, |
| { |
| "completion_length": 776.75, |
| "epoch": 0.0005574979492039726, |
| "grad_norm": 0.11483483016490936, |
| "kl": 0.0007510531373554841, |
| "learning_rate": 1.4583333333333335e-06, |
| "loss": 0.0, |
| "reward": 0.584375, |
| "reward_std": 0.32483330443501474, |
| "rewards/custom_reward_simplified_v7_dblog": 0.584375, |
| "step": 70 |
| }, |
| { |
| "completion_length": 804.675, |
| "epoch": 0.0006371405133759687, |
| "grad_norm": 0.17995329201221466, |
| "kl": 0.0007302156562218442, |
| "learning_rate": 1.6666666666666667e-06, |
| "loss": 0.0, |
| "reward": 0.703125, |
| "reward_std": 0.32263160347938535, |
| "rewards/custom_reward_simplified_v7_dblog": 0.703125, |
| "step": 80 |
| }, |
| { |
| "completion_length": 793.0875, |
| "epoch": 0.0007167830775479647, |
| "grad_norm": 0.16513389348983765, |
| "kl": 0.0007239854254294187, |
| "learning_rate": 1.8750000000000003e-06, |
| "loss": 0.0, |
| "reward": 0.709375, |
| "reward_std": 0.3102527566254139, |
| "rewards/custom_reward_simplified_v7_dblog": 0.709375, |
| "step": 90 |
| }, |
| { |
| "completion_length": 812.0, |
| "epoch": 0.0007964256417199608, |
| "grad_norm": 0.1802467256784439, |
| "kl": 0.0007639184041181579, |
| "learning_rate": 2.0833333333333334e-06, |
| "loss": 0.0, |
| "reward": 0.528125, |
| "reward_std": 0.21242836564779283, |
| "rewards/custom_reward_simplified_v7_dblog": 0.528125, |
| "step": 100 |
| }, |
| { |
| "completion_length": 784.64375, |
| "epoch": 0.0008760682058919569, |
| "grad_norm": 0.17609436810016632, |
| "kl": 0.0007660316972760483, |
| "learning_rate": 2.2916666666666666e-06, |
| "loss": 0.0, |
| "reward": 0.565625, |
| "reward_std": 0.3309394560754299, |
| "rewards/custom_reward_simplified_v7_dblog": 0.565625, |
| "step": 110 |
| }, |
| { |
| "completion_length": 717.24375, |
| "epoch": 0.000955710770063953, |
| "grad_norm": 0.14550578594207764, |
| "kl": 0.0007782038446748629, |
| "learning_rate": 2.5e-06, |
| "loss": 0.0, |
| "reward": 0.728125, |
| "reward_std": 0.2573545627295971, |
| "rewards/custom_reward_simplified_v7_dblog": 0.728125, |
| "step": 120 |
| }, |
| { |
| "completion_length": 872.6375, |
| "epoch": 0.001035353334235949, |
| "grad_norm": 0.11807532608509064, |
| "kl": 0.0007370044564595446, |
| "learning_rate": 2.7083333333333334e-06, |
| "loss": 0.0, |
| "reward": 0.45, |
| "reward_std": 0.24368184804916382, |
| "rewards/custom_reward_simplified_v7_dblog": 0.45, |
| "step": 130 |
| }, |
| { |
| "completion_length": 780.325, |
| "epoch": 0.0011149958984079452, |
| "grad_norm": 0.21067936718463898, |
| "kl": 0.0007969280297402293, |
| "learning_rate": 2.916666666666667e-06, |
| "loss": 0.0, |
| "reward": 0.671875, |
| "reward_std": 0.3312204420566559, |
| "rewards/custom_reward_simplified_v7_dblog": 0.671875, |
| "step": 140 |
| }, |
| { |
| "completion_length": 796.15625, |
| "epoch": 0.0011946384625799412, |
| "grad_norm": 0.11178277432918549, |
| "kl": 0.0007584215141832829, |
| "learning_rate": 3.125e-06, |
| "loss": 0.0, |
| "reward": 0.675, |
| "reward_std": 0.2411833107471466, |
| "rewards/custom_reward_simplified_v7_dblog": 0.675, |
| "step": 150 |
| }, |
| { |
| "completion_length": 735.4375, |
| "epoch": 0.0012742810267519374, |
| "grad_norm": 0.12408847361803055, |
| "kl": 0.0008089728711638599, |
| "learning_rate": 3.3333333333333333e-06, |
| "loss": 0.0, |
| "reward": 0.5875, |
| "reward_std": 0.2907834567129612, |
| "rewards/custom_reward_simplified_v7_dblog": 0.5875, |
| "step": 160 |
| }, |
| { |
| "completion_length": 630.76875, |
| "epoch": 0.0013539235909239334, |
| "grad_norm": 0.14481835067272186, |
| "kl": 0.0008351787488209084, |
| "learning_rate": 3.5416666666666673e-06, |
| "loss": 0.0, |
| "reward": 0.828125, |
| "reward_std": 0.3232325129210949, |
| "rewards/custom_reward_simplified_v7_dblog": 0.828125, |
| "step": 170 |
| }, |
| { |
| "completion_length": 704.2, |
| "epoch": 0.0014335661550959294, |
| "grad_norm": 0.22581899166107178, |
| "kl": 0.0008706353197339922, |
| "learning_rate": 3.7500000000000005e-06, |
| "loss": 0.0, |
| "reward": 0.621875, |
| "reward_std": 0.2438264600932598, |
| "rewards/custom_reward_simplified_v7_dblog": 0.621875, |
| "step": 180 |
| }, |
| { |
| "completion_length": 738.2625, |
| "epoch": 0.0015132087192679256, |
| "grad_norm": 0.20901009440422058, |
| "kl": 0.000852665287675336, |
| "learning_rate": 3.958333333333333e-06, |
| "loss": 0.0, |
| "reward": 0.659375, |
| "reward_std": 0.2661551833152771, |
| "rewards/custom_reward_simplified_v7_dblog": 0.659375, |
| "step": 190 |
| }, |
| { |
| "completion_length": 773.31875, |
| "epoch": 0.0015928512834399217, |
| "grad_norm": 0.14023999869823456, |
| "kl": 0.0008427878346992657, |
| "learning_rate": 4.166666666666667e-06, |
| "loss": 0.0, |
| "reward": 0.575, |
| "reward_std": 0.263551290333271, |
| "rewards/custom_reward_simplified_v7_dblog": 0.575, |
| "step": 200 |
| }, |
| { |
| "completion_length": 760.4, |
| "epoch": 0.0016724938476119177, |
| "grad_norm": 0.15415024757385254, |
| "kl": 0.0009272771596442908, |
| "learning_rate": 4.3750000000000005e-06, |
| "loss": 0.0, |
| "reward": 0.578125, |
| "reward_std": 0.3055797599256039, |
| "rewards/custom_reward_simplified_v7_dblog": 0.578125, |
| "step": 210 |
| }, |
| { |
| "completion_length": 824.94375, |
| "epoch": 0.0017521364117839139, |
| "grad_norm": 0.18523605167865753, |
| "kl": 0.0008898543601389974, |
| "learning_rate": 4.583333333333333e-06, |
| "loss": 0.0, |
| "reward": 0.58125, |
| "reward_std": 0.2951655209064484, |
| "rewards/custom_reward_simplified_v7_dblog": 0.58125, |
| "step": 220 |
| }, |
| { |
| "completion_length": 748.5375, |
| "epoch": 0.0018317789759559099, |
| "grad_norm": 0.11306847631931305, |
| "kl": 0.0009787698683794588, |
| "learning_rate": 4.791666666666668e-06, |
| "loss": 0.0, |
| "reward": 0.590625, |
| "reward_std": 0.2887454777956009, |
| "rewards/custom_reward_simplified_v7_dblog": 0.590625, |
| "step": 230 |
| }, |
| { |
| "completion_length": 753.39375, |
| "epoch": 0.001911421540127906, |
| "grad_norm": 0.0014718669699504972, |
| "kl": 0.0010118414385942743, |
| "learning_rate": 5e-06, |
| "loss": 0.0, |
| "reward": 0.55625, |
| "reward_std": 0.1977315753698349, |
| "rewards/custom_reward_simplified_v7_dblog": 0.55625, |
| "step": 240 |
| }, |
| { |
| "completion_length": 812.71875, |
| "epoch": 0.001991064104299902, |
| "grad_norm": 0.11223085969686508, |
| "kl": 0.0010390775743871928, |
| "learning_rate": 4.999735579817769e-06, |
| "loss": 0.0, |
| "reward": 0.6875, |
| "reward_std": 0.24264758601784706, |
| "rewards/custom_reward_simplified_v7_dblog": 0.6875, |
| "step": 250 |
| }, |
| { |
| "completion_length": 731.66875, |
| "epoch": 0.002070706668471898, |
| "grad_norm": 0.1944543570280075, |
| "kl": 0.001084678602637723, |
| "learning_rate": 4.998942375205502e-06, |
| "loss": 0.0, |
| "reward": 0.796875, |
| "reward_std": 0.31279500126838683, |
| "rewards/custom_reward_simplified_v7_dblog": 0.796875, |
| "step": 260 |
| }, |
| { |
| "completion_length": 770.69375, |
| "epoch": 0.0021503492326438944, |
| "grad_norm": 0.10904921591281891, |
| "kl": 0.0012701354396995157, |
| "learning_rate": 4.997620553954645e-06, |
| "loss": 0.0001, |
| "reward": 0.653125, |
| "reward_std": 0.1583670809864998, |
| "rewards/custom_reward_simplified_v7_dblog": 0.653125, |
| "step": 270 |
| }, |
| { |
| "completion_length": 646.63125, |
| "epoch": 0.0022299917968158904, |
| "grad_norm": 0.11776451766490936, |
| "kl": 0.0026803009008290247, |
| "learning_rate": 4.995770395678171e-06, |
| "loss": 0.0001, |
| "reward": 0.78125, |
| "reward_std": 0.37105962783098223, |
| "rewards/custom_reward_simplified_v7_dblog": 0.78125, |
| "step": 280 |
| }, |
| { |
| "completion_length": 850.5125, |
| "epoch": 0.0023096343609878864, |
| "grad_norm": 0.17029190063476562, |
| "kl": 0.0011812534503405914, |
| "learning_rate": 4.993392291751431e-06, |
| "loss": 0.0, |
| "reward": 0.503125, |
| "reward_std": 0.2320079453289509, |
| "rewards/custom_reward_simplified_v7_dblog": 0.503125, |
| "step": 290 |
| }, |
| { |
| "completion_length": 774.4125, |
| "epoch": 0.0023892769251598824, |
| "grad_norm": 0.17417992651462555, |
| "kl": 0.001456298804259859, |
| "learning_rate": 4.990486745229364e-06, |
| "loss": 0.0001, |
| "reward": 0.621875, |
| "reward_std": 0.24568462520837783, |
| "rewards/custom_reward_simplified_v7_dblog": 0.621875, |
| "step": 300 |
| }, |
| { |
| "completion_length": 733.675, |
| "epoch": 0.0024689194893318784, |
| "grad_norm": 0.13222694396972656, |
| "kl": 0.001577114372048527, |
| "learning_rate": 4.9870543707400835e-06, |
| "loss": 0.0001, |
| "reward": 0.653125, |
| "reward_std": 0.27914761677384375, |
| "rewards/custom_reward_simplified_v7_dblog": 0.653125, |
| "step": 310 |
| }, |
| { |
| "completion_length": 711.91875, |
| "epoch": 0.002548562053503875, |
| "grad_norm": 0.19241130352020264, |
| "kl": 0.0017230566183570773, |
| "learning_rate": 4.983095894354858e-06, |
| "loss": 0.0001, |
| "reward": 0.68125, |
| "reward_std": 0.3178554192185402, |
| "rewards/custom_reward_simplified_v7_dblog": 0.68125, |
| "step": 320 |
| }, |
| { |
| "completion_length": 780.34375, |
| "epoch": 0.002628204617675871, |
| "grad_norm": 0.1997414082288742, |
| "kl": 0.002029248425969854, |
| "learning_rate": 4.978612153434527e-06, |
| "loss": 0.0001, |
| "reward": 0.696875, |
| "reward_std": 0.32896072417497635, |
| "rewards/custom_reward_simplified_v7_dblog": 0.696875, |
| "step": 330 |
| }, |
| { |
| "completion_length": 695.63125, |
| "epoch": 0.002707847181847867, |
| "grad_norm": 0.18966233730316162, |
| "kl": 0.002277573832543567, |
| "learning_rate": 4.973604096452361e-06, |
| "loss": 0.0001, |
| "reward": 0.684375, |
| "reward_std": 0.2995404839515686, |
| "rewards/custom_reward_simplified_v7_dblog": 0.684375, |
| "step": 340 |
| }, |
| { |
| "completion_length": 719.425, |
| "epoch": 0.002787489746019863, |
| "grad_norm": 0.17769980430603027, |
| "kl": 0.002305405435618013, |
| "learning_rate": 4.968072782793436e-06, |
| "loss": 0.0001, |
| "reward": 0.74375, |
| "reward_std": 0.3807508498430252, |
| "rewards/custom_reward_simplified_v7_dblog": 0.74375, |
| "step": 350 |
| }, |
| { |
| "completion_length": 732.4375, |
| "epoch": 0.002867132310191859, |
| "grad_norm": 0.21898534893989563, |
| "kl": 0.002607938600704074, |
| "learning_rate": 4.962019382530521e-06, |
| "loss": 0.0001, |
| "reward": 0.596875, |
| "reward_std": 0.303117785602808, |
| "rewards/custom_reward_simplified_v7_dblog": 0.596875, |
| "step": 360 |
| }, |
| { |
| "completion_length": 703.21875, |
| "epoch": 0.002946774874363855, |
| "grad_norm": 0.20463985204696655, |
| "kl": 0.0030091375578194858, |
| "learning_rate": 4.955445176176577e-06, |
| "loss": 0.0001, |
| "reward": 0.746875, |
| "reward_std": 0.28880608528852464, |
| "rewards/custom_reward_simplified_v7_dblog": 0.746875, |
| "step": 370 |
| }, |
| { |
| "completion_length": 646.95625, |
| "epoch": 0.0030264174385358513, |
| "grad_norm": 0.17787523567676544, |
| "kl": 0.003602780296932906, |
| "learning_rate": 4.948351554413879e-06, |
| "loss": 0.0001, |
| "reward": 0.753125, |
| "reward_std": 0.31493050456047056, |
| "rewards/custom_reward_simplified_v7_dblog": 0.753125, |
| "step": 380 |
| }, |
| { |
| "completion_length": 656.94375, |
| "epoch": 0.0031060600027078473, |
| "grad_norm": 0.18550129234790802, |
| "kl": 0.003282526368275285, |
| "learning_rate": 4.9407400177998335e-06, |
| "loss": 0.0001, |
| "reward": 0.828125, |
| "reward_std": 0.33323406875133516, |
| "rewards/custom_reward_simplified_v7_dblog": 0.828125, |
| "step": 390 |
| }, |
| { |
| "completion_length": 740.3125, |
| "epoch": 0.0031857025668798433, |
| "grad_norm": 0.19987954199314117, |
| "kl": 0.003102585405576974, |
| "learning_rate": 4.93261217644956e-06, |
| "loss": 0.0001, |
| "reward": 0.590625, |
| "reward_std": 0.26303397938609124, |
| "rewards/custom_reward_simplified_v7_dblog": 0.590625, |
| "step": 400 |
| }, |
| { |
| "completion_length": 641.26875, |
| "epoch": 0.0032653451310518393, |
| "grad_norm": 0.21161562204360962, |
| "kl": 0.003351045388262719, |
| "learning_rate": 4.9239697496952904e-06, |
| "loss": 0.0001, |
| "reward": 0.909375, |
| "reward_std": 0.3579762116074562, |
| "rewards/custom_reward_simplified_v7_dblog": 0.909375, |
| "step": 410 |
| }, |
| { |
| "completion_length": 692.06875, |
| "epoch": 0.0033449876952238353, |
| "grad_norm": 0.17584940791130066, |
| "kl": 0.003339459316339344, |
| "learning_rate": 4.914814565722671e-06, |
| "loss": 0.0001, |
| "reward": 0.765625, |
| "reward_std": 0.3109076008200645, |
| "rewards/custom_reward_simplified_v7_dblog": 0.765625, |
| "step": 420 |
| }, |
| { |
| "completion_length": 638.51875, |
| "epoch": 0.0034246302593958313, |
| "grad_norm": 0.17778904736042023, |
| "kl": 0.0034626491484232246, |
| "learning_rate": 4.905148561184033e-06, |
| "loss": 0.0001, |
| "reward": 0.671875, |
| "reward_std": 0.2665500298142433, |
| "rewards/custom_reward_simplified_v7_dblog": 0.671875, |
| "step": 430 |
| }, |
| { |
| "completion_length": 597.475, |
| "epoch": 0.0035042728235678278, |
| "grad_norm": 0.127123162150383, |
| "kl": 0.0039646215736866, |
| "learning_rate": 4.894973780788722e-06, |
| "loss": 0.0002, |
| "reward": 0.88125, |
| "reward_std": 0.28942874893546106, |
| "rewards/custom_reward_simplified_v7_dblog": 0.88125, |
| "step": 440 |
| }, |
| { |
| "completion_length": 651.6625, |
| "epoch": 0.0035839153877398238, |
| "grad_norm": 0.21087874472141266, |
| "kl": 0.004210945626255125, |
| "learning_rate": 4.884292376870567e-06, |
| "loss": 0.0002, |
| "reward": 0.753125, |
| "reward_std": 0.29777742698788645, |
| "rewards/custom_reward_simplified_v7_dblog": 0.753125, |
| "step": 450 |
| }, |
| { |
| "completion_length": 727.13125, |
| "epoch": 0.0036635579519118198, |
| "grad_norm": 0.18630079925060272, |
| "kl": 0.003935616160742938, |
| "learning_rate": 4.873106608932585e-06, |
| "loss": 0.0002, |
| "reward": 0.678125, |
| "reward_std": 0.31932896226644514, |
| "rewards/custom_reward_simplified_v7_dblog": 0.678125, |
| "step": 460 |
| }, |
| { |
| "completion_length": 716.74375, |
| "epoch": 0.003743200516083816, |
| "grad_norm": 0.1637570858001709, |
| "kl": 0.004373999196104705, |
| "learning_rate": 4.861418843169012e-06, |
| "loss": 0.0002, |
| "reward": 0.646875, |
| "reward_std": 0.26624983847141265, |
| "rewards/custom_reward_simplified_v7_dblog": 0.646875, |
| "step": 470 |
| }, |
| { |
| "completion_length": 581.90625, |
| "epoch": 0.003822843080255812, |
| "grad_norm": 0.0051241409964859486, |
| "kl": 0.004909415659494698, |
| "learning_rate": 4.849231551964771e-06, |
| "loss": 0.0002, |
| "reward": 0.75625, |
| "reward_std": 0.19474873542785645, |
| "rewards/custom_reward_simplified_v7_dblog": 0.75625, |
| "step": 480 |
| }, |
| { |
| "completion_length": 680.94375, |
| "epoch": 0.003902485644427808, |
| "grad_norm": 0.15670013427734375, |
| "kl": 0.004694941581692547, |
| "learning_rate": 4.836547313372472e-06, |
| "loss": 0.0002, |
| "reward": 0.73125, |
| "reward_std": 0.2675834000110626, |
| "rewards/custom_reward_simplified_v7_dblog": 0.73125, |
| "step": 490 |
| }, |
| { |
| "completion_length": 699.1, |
| "epoch": 0.003982128208599804, |
| "grad_norm": 0.1365301012992859, |
| "kl": 0.00405421577161178, |
| "learning_rate": 4.823368810567056e-06, |
| "loss": 0.0002, |
| "reward": 0.603125, |
| "reward_std": 0.25718758851289747, |
| "rewards/custom_reward_simplified_v7_dblog": 0.603125, |
| "step": 500 |
| }, |
| { |
| "completion_length": 646.7, |
| "epoch": 0.0040617707727718, |
| "grad_norm": 0.14925876259803772, |
| "kl": 0.003934591950383037, |
| "learning_rate": 4.809698831278217e-06, |
| "loss": 0.0002, |
| "reward": 0.734375, |
| "reward_std": 0.2696119427680969, |
| "rewards/custom_reward_simplified_v7_dblog": 0.734375, |
| "step": 510 |
| }, |
| { |
| "completion_length": 726.08125, |
| "epoch": 0.004141413336943796, |
| "grad_norm": 0.2107785940170288, |
| "kl": 0.004233359964564443, |
| "learning_rate": 4.7955402672006855e-06, |
| "loss": 0.0002, |
| "reward": 0.759375, |
| "reward_std": 0.2953102938830853, |
| "rewards/custom_reward_simplified_v7_dblog": 0.759375, |
| "step": 520 |
| }, |
| { |
| "completion_length": 633.525, |
| "epoch": 0.004221055901115793, |
| "grad_norm": 0.2159271538257599, |
| "kl": 0.004929024970624596, |
| "learning_rate": 4.780896113382536e-06, |
| "loss": 0.0002, |
| "reward": 0.75625, |
| "reward_std": 0.2647860750555992, |
| "rewards/custom_reward_simplified_v7_dblog": 0.75625, |
| "step": 530 |
| }, |
| { |
| "completion_length": 586.9125, |
| "epoch": 0.004300698465287789, |
| "grad_norm": 0.2394983470439911, |
| "kl": 0.004724201350472868, |
| "learning_rate": 4.765769467591626e-06, |
| "loss": 0.0002, |
| "reward": 0.975, |
| "reward_std": 0.36022927314043046, |
| "rewards/custom_reward_simplified_v7_dblog": 0.975, |
| "step": 540 |
| }, |
| { |
| "completion_length": 651.88125, |
| "epoch": 0.004380341029459785, |
| "grad_norm": 0.1552504301071167, |
| "kl": 0.004269527771975845, |
| "learning_rate": 4.750163529660303e-06, |
| "loss": 0.0002, |
| "reward": 0.790625, |
| "reward_std": 0.2759058982133865, |
| "rewards/custom_reward_simplified_v7_dblog": 0.790625, |
| "step": 550 |
| }, |
| { |
| "completion_length": 655.9125, |
| "epoch": 0.004459983593631781, |
| "grad_norm": 0.13005749881267548, |
| "kl": 0.004541868972592056, |
| "learning_rate": 4.734081600808531e-06, |
| "loss": 0.0002, |
| "reward": 0.796875, |
| "reward_std": 0.2369130529463291, |
| "rewards/custom_reward_simplified_v7_dblog": 0.796875, |
| "step": 560 |
| }, |
| { |
| "completion_length": 630.3375, |
| "epoch": 0.004539626157803777, |
| "grad_norm": 0.14732114970684052, |
| "kl": 0.004577037692070007, |
| "learning_rate": 4.717527082945555e-06, |
| "loss": 0.0002, |
| "reward": 0.925, |
| "reward_std": 0.3310479797422886, |
| "rewards/custom_reward_simplified_v7_dblog": 0.925, |
| "step": 570 |
| }, |
| { |
| "completion_length": 693.2625, |
| "epoch": 0.004619268721975773, |
| "grad_norm": 0.11388376355171204, |
| "kl": 0.004154781624674797, |
| "learning_rate": 4.700503477950278e-06, |
| "loss": 0.0002, |
| "reward": 0.6875, |
| "reward_std": 0.29332098439335824, |
| "rewards/custom_reward_simplified_v7_dblog": 0.6875, |
| "step": 580 |
| }, |
| { |
| "completion_length": 662.7625, |
| "epoch": 0.004698911286147769, |
| "grad_norm": 0.15470421314239502, |
| "kl": 0.00541011628229171, |
| "learning_rate": 4.6830143869304904e-06, |
| "loss": 0.0002, |
| "reward": 0.809375, |
| "reward_std": 0.32753978818655016, |
| "rewards/custom_reward_simplified_v7_dblog": 0.809375, |
| "step": 590 |
| }, |
| { |
| "completion_length": 698.95625, |
| "epoch": 0.004778553850319765, |
| "grad_norm": 0.004228990990668535, |
| "kl": 0.004637495230417699, |
| "learning_rate": 4.665063509461098e-06, |
| "loss": 0.0002, |
| "reward": 0.75, |
| "reward_std": 0.23772156983613968, |
| "rewards/custom_reward_simplified_v7_dblog": 0.75, |
| "step": 600 |
| }, |
| { |
| "completion_length": 629.7625, |
| "epoch": 0.004858196414491761, |
| "grad_norm": 0.21860064566135406, |
| "kl": 0.0044788535917177795, |
| "learning_rate": 4.646654642801533e-06, |
| "loss": 0.0002, |
| "reward": 0.8125, |
| "reward_std": 0.27716630697250366, |
| "rewards/custom_reward_simplified_v7_dblog": 0.8125, |
| "step": 610 |
| }, |
| { |
| "completion_length": 727.06875, |
| "epoch": 0.004937838978663757, |
| "grad_norm": 0.1765265315771103, |
| "kl": 0.004957099666353315, |
| "learning_rate": 4.627791681092499e-06, |
| "loss": 0.0002, |
| "reward": 0.6, |
| "reward_std": 0.2689620770514011, |
| "rewards/custom_reward_simplified_v7_dblog": 0.6, |
| "step": 620 |
| }, |
| { |
| "completion_length": 718.825, |
| "epoch": 0.005017481542835753, |
| "grad_norm": 0.12771090865135193, |
| "kl": 0.005165508517529815, |
| "learning_rate": 4.608478614532215e-06, |
| "loss": 0.0002, |
| "reward": 0.728125, |
| "reward_std": 0.3053886480629444, |
| "rewards/custom_reward_simplified_v7_dblog": 0.728125, |
| "step": 630 |
| }, |
| { |
| "completion_length": 629.525, |
| "epoch": 0.00509712410700775, |
| "grad_norm": 0.17840693891048431, |
| "kl": 0.005059469246771186, |
| "learning_rate": 4.588719528532342e-06, |
| "loss": 0.0002, |
| "reward": 0.721875, |
| "reward_std": 0.298052953928709, |
| "rewards/custom_reward_simplified_v7_dblog": 0.721875, |
| "step": 640 |
| }, |
| { |
| "completion_length": 668.68125, |
| "epoch": 0.005176766671179746, |
| "grad_norm": 0.12746350467205048, |
| "kl": 0.004331990797072649, |
| "learning_rate": 4.568518602853776e-06, |
| "loss": 0.0002, |
| "reward": 0.746875, |
| "reward_std": 0.22913563549518584, |
| "rewards/custom_reward_simplified_v7_dblog": 0.746875, |
| "step": 650 |
| }, |
| { |
| "completion_length": 734.9875, |
| "epoch": 0.005256409235351742, |
| "grad_norm": 0.19717195630073547, |
| "kl": 0.00479215239174664, |
| "learning_rate": 4.54788011072248e-06, |
| "loss": 0.0002, |
| "reward": 0.784375, |
| "reward_std": 0.4230809181928635, |
| "rewards/custom_reward_simplified_v7_dblog": 0.784375, |
| "step": 660 |
| }, |
| { |
| "completion_length": 658.29375, |
| "epoch": 0.005336051799523738, |
| "grad_norm": 0.2698514759540558, |
| "kl": 0.004821322776842862, |
| "learning_rate": 4.526808417925531e-06, |
| "loss": 0.0002, |
| "reward": 0.81875, |
| "reward_std": 0.26030006259679794, |
| "rewards/custom_reward_simplified_v7_dblog": 0.81875, |
| "step": 670 |
| }, |
| { |
| "completion_length": 696.30625, |
| "epoch": 0.005415694363695734, |
| "grad_norm": 0.2144252061843872, |
| "kl": 0.005292760988231749, |
| "learning_rate": 4.50530798188761e-06, |
| "loss": 0.0002, |
| "reward": 0.609375, |
| "reward_std": 0.2595392823219299, |
| "rewards/custom_reward_simplified_v7_dblog": 0.609375, |
| "step": 680 |
| }, |
| { |
| "completion_length": 696.99375, |
| "epoch": 0.00549533692786773, |
| "grad_norm": 0.006262101698666811, |
| "kl": 0.005413674132432789, |
| "learning_rate": 4.4833833507280884e-06, |
| "loss": 0.0002, |
| "reward": 0.684375, |
| "reward_std": 0.24843912497162818, |
| "rewards/custom_reward_simplified_v7_dblog": 0.684375, |
| "step": 690 |
| }, |
| { |
| "completion_length": 675.50625, |
| "epoch": 0.005574979492039726, |
| "grad_norm": 0.16301825642585754, |
| "kl": 0.005892223375849426, |
| "learning_rate": 4.46103916229894e-06, |
| "loss": 0.0002, |
| "reward": 0.80625, |
| "reward_std": 0.34091843143105505, |
| "rewards/custom_reward_simplified_v7_dblog": 0.80625, |
| "step": 700 |
| }, |
| { |
| "completion_length": 725.675, |
| "epoch": 0.005654622056211722, |
| "grad_norm": 0.18473494052886963, |
| "kl": 0.005652935197576881, |
| "learning_rate": 4.438280143203665e-06, |
| "loss": 0.0002, |
| "reward": 0.66875, |
| "reward_std": 0.216452856361866, |
| "rewards/custom_reward_simplified_v7_dblog": 0.66875, |
| "step": 710 |
| }, |
| { |
| "completion_length": 764.26875, |
| "epoch": 0.005734264620383718, |
| "grad_norm": 0.17735017836093903, |
| "kl": 0.005824547982774675, |
| "learning_rate": 4.415111107797445e-06, |
| "loss": 0.0002, |
| "reward": 0.634375, |
| "reward_std": 0.25477964654564855, |
| "rewards/custom_reward_simplified_v7_dblog": 0.634375, |
| "step": 720 |
| }, |
| { |
| "completion_length": 607.86875, |
| "epoch": 0.005813907184555714, |
| "grad_norm": 0.20680995285511017, |
| "kl": 0.0055589195340871814, |
| "learning_rate": 4.391536957168733e-06, |
| "loss": 0.0002, |
| "reward": 0.8, |
| "reward_std": 0.32480863481760025, |
| "rewards/custom_reward_simplified_v7_dblog": 0.8, |
| "step": 730 |
| }, |
| { |
| "completion_length": 674.13125, |
| "epoch": 0.00589354974872771, |
| "grad_norm": 0.005594769027084112, |
| "kl": 0.005972519854549318, |
| "learning_rate": 4.367562678102491e-06, |
| "loss": 0.0002, |
| "reward": 0.665625, |
| "reward_std": 0.20820673778653145, |
| "rewards/custom_reward_simplified_v7_dblog": 0.665625, |
| "step": 740 |
| }, |
| { |
| "completion_length": 639.69375, |
| "epoch": 0.005973192312899706, |
| "grad_norm": 0.11012833565473557, |
| "kl": 0.005814655229914934, |
| "learning_rate": 4.34319334202531e-06, |
| "loss": 0.0002, |
| "reward": 0.796875, |
| "reward_std": 0.34761993661522866, |
| "rewards/custom_reward_simplified_v7_dblog": 0.796875, |
| "step": 750 |
| }, |
| { |
| "completion_length": 587.6, |
| "epoch": 0.006052834877071703, |
| "grad_norm": 0.2750849723815918, |
| "kl": 0.006217251974157989, |
| "learning_rate": 4.318434103932622e-06, |
| "loss": 0.0002, |
| "reward": 0.75625, |
| "reward_std": 0.23903784826397895, |
| "rewards/custom_reward_simplified_v7_dblog": 0.75625, |
| "step": 760 |
| }, |
| { |
| "completion_length": 691.45625, |
| "epoch": 0.006132477441243699, |
| "grad_norm": 0.12792551517486572, |
| "kl": 0.005762395297642798, |
| "learning_rate": 4.293290201298224e-06, |
| "loss": 0.0002, |
| "reward": 0.65, |
| "reward_std": 0.282283828407526, |
| "rewards/custom_reward_simplified_v7_dblog": 0.65, |
| "step": 770 |
| }, |
| { |
| "completion_length": 634.79375, |
| "epoch": 0.006212120005415695, |
| "grad_norm": 0.11762549728155136, |
| "kl": 0.005472023575566709, |
| "learning_rate": 4.267766952966369e-06, |
| "loss": 0.0002, |
| "reward": 0.878125, |
| "reward_std": 0.31506996527314185, |
| "rewards/custom_reward_simplified_v7_dblog": 0.878125, |
| "step": 780 |
| }, |
| { |
| "completion_length": 719.05625, |
| "epoch": 0.006291762569587691, |
| "grad_norm": 0.0052847606129944324, |
| "kl": 0.006504135020077228, |
| "learning_rate": 4.241869758026638e-06, |
| "loss": 0.0003, |
| "reward": 0.628125, |
| "reward_std": 0.2685270056128502, |
| "rewards/custom_reward_simplified_v7_dblog": 0.628125, |
| "step": 790 |
| }, |
| { |
| "completion_length": 699.19375, |
| "epoch": 0.006371405133759687, |
| "grad_norm": 0.2003583461046219, |
| "kl": 0.005931918846908957, |
| "learning_rate": 4.215604094671835e-06, |
| "loss": 0.0002, |
| "reward": 0.746875, |
| "reward_std": 0.25832219421863556, |
| "rewards/custom_reward_simplified_v7_dblog": 0.746875, |
| "step": 800 |
| }, |
| { |
| "completion_length": 652.925, |
| "epoch": 0.006451047697931683, |
| "grad_norm": 0.0062674470245838165, |
| "kl": 0.006221415114123374, |
| "learning_rate": 4.188975519039151e-06, |
| "loss": 0.0002, |
| "reward": 0.73125, |
| "reward_std": 0.3172403134405613, |
| "rewards/custom_reward_simplified_v7_dblog": 0.73125, |
| "step": 810 |
| }, |
| { |
| "completion_length": 668.63125, |
| "epoch": 0.006530690262103679, |
| "grad_norm": 0.13624051213264465, |
| "kl": 0.0063671735813841225, |
| "learning_rate": 4.161989664034844e-06, |
| "loss": 0.0003, |
| "reward": 0.684375, |
| "reward_std": 0.24903304055333136, |
| "rewards/custom_reward_simplified_v7_dblog": 0.684375, |
| "step": 820 |
| }, |
| { |
| "completion_length": 658.575, |
| "epoch": 0.006610332826275675, |
| "grad_norm": 0.2923766076564789, |
| "kl": 0.0068331335205584764, |
| "learning_rate": 4.134652238142674e-06, |
| "loss": 0.0003, |
| "reward": 0.73125, |
| "reward_std": 0.3243869088590145, |
| "rewards/custom_reward_simplified_v7_dblog": 0.73125, |
| "step": 830 |
| }, |
| { |
| "completion_length": 645.31875, |
| "epoch": 0.006689975390447671, |
| "grad_norm": 0.22414511442184448, |
| "kl": 0.006329123536124826, |
| "learning_rate": 4.106969024216348e-06, |
| "loss": 0.0003, |
| "reward": 0.728125, |
| "reward_std": 0.2578707054257393, |
| "rewards/custom_reward_simplified_v7_dblog": 0.728125, |
| "step": 840 |
| }, |
| { |
| "completion_length": 620.76875, |
| "epoch": 0.006769617954619667, |
| "grad_norm": 0.2500353455543518, |
| "kl": 0.006427089823409915, |
| "learning_rate": 4.078945878256244e-06, |
| "loss": 0.0003, |
| "reward": 0.85625, |
| "reward_std": 0.3704014003276825, |
| "rewards/custom_reward_simplified_v7_dblog": 0.85625, |
| "step": 850 |
| }, |
| { |
| "completion_length": 545.075, |
| "epoch": 0.006849260518791663, |
| "grad_norm": 0.18576188385486603, |
| "kl": 0.005737546656746417, |
| "learning_rate": 4.0505887281706505e-06, |
| "loss": 0.0002, |
| "reward": 0.9125, |
| "reward_std": 0.27787805944681165, |
| "rewards/custom_reward_simplified_v7_dblog": 0.9125, |
| "step": 860 |
| }, |
| { |
| "completion_length": 671.1375, |
| "epoch": 0.0069289030829636595, |
| "grad_norm": 0.27761420607566833, |
| "kl": 0.005926149617880583, |
| "learning_rate": 4.021903572521802e-06, |
| "loss": 0.0002, |
| "reward": 0.71875, |
| "reward_std": 0.1984293892979622, |
| "rewards/custom_reward_simplified_v7_dblog": 0.71875, |
| "step": 870 |
| }, |
| { |
| "completion_length": 591.325, |
| "epoch": 0.0070085456471356555, |
| "grad_norm": 0.12898898124694824, |
| "kl": 0.006013317289762199, |
| "learning_rate": 3.992896479256966e-06, |
| "loss": 0.0002, |
| "reward": 0.875, |
| "reward_std": 0.31373453289270403, |
| "rewards/custom_reward_simplified_v7_dblog": 0.875, |
| "step": 880 |
| }, |
| { |
| "completion_length": 709.64375, |
| "epoch": 0.0070881882113076515, |
| "grad_norm": 0.1858564019203186, |
| "kl": 0.006654553860425949, |
| "learning_rate": 3.963573584424852e-06, |
| "loss": 0.0003, |
| "reward": 0.875, |
| "reward_std": 0.40053595080971716, |
| "rewards/custom_reward_simplified_v7_dblog": 0.875, |
| "step": 890 |
| }, |
| { |
| "completion_length": 693.86875, |
| "epoch": 0.0071678307754796475, |
| "grad_norm": 0.23618744313716888, |
| "kl": 0.006588698271661997, |
| "learning_rate": 3.933941090877615e-06, |
| "loss": 0.0003, |
| "reward": 0.6875, |
| "reward_std": 0.22922600656747819, |
| "rewards/custom_reward_simplified_v7_dblog": 0.6875, |
| "step": 900 |
| }, |
| { |
| "completion_length": 655.1, |
| "epoch": 0.0072474733396516436, |
| "grad_norm": 0.18607589602470398, |
| "kl": 0.006554636568762362, |
| "learning_rate": 3.9040052669587325e-06, |
| "loss": 0.0003, |
| "reward": 0.79375, |
| "reward_std": 0.26788339093327523, |
| "rewards/custom_reward_simplified_v7_dblog": 0.79375, |
| "step": 910 |
| }, |
| { |
| "completion_length": 678.36875, |
| "epoch": 0.0073271159038236396, |
| "grad_norm": 0.15605397522449493, |
| "kl": 0.006827571708709001, |
| "learning_rate": 3.8737724451770155e-06, |
| "loss": 0.0003, |
| "reward": 0.74375, |
| "reward_std": 0.25242582634091376, |
| "rewards/custom_reward_simplified_v7_dblog": 0.74375, |
| "step": 920 |
| }, |
| { |
| "completion_length": 640.1875, |
| "epoch": 0.0074067584679956356, |
| "grad_norm": 0.22241215407848358, |
| "kl": 0.006700195767916739, |
| "learning_rate": 3.8432490208670605e-06, |
| "loss": 0.0003, |
| "reward": 0.753125, |
| "reward_std": 0.30004683434963225, |
| "rewards/custom_reward_simplified_v7_dblog": 0.753125, |
| "step": 930 |
| }, |
| { |
| "completion_length": 671.025, |
| "epoch": 0.007486401032167632, |
| "grad_norm": 0.2610742747783661, |
| "kl": 0.007203501905314625, |
| "learning_rate": 3.8124414508364005e-06, |
| "loss": 0.0003, |
| "reward": 0.696875, |
| "reward_std": 0.2809624969959259, |
| "rewards/custom_reward_simplified_v7_dblog": 0.696875, |
| "step": 940 |
| }, |
| { |
| "completion_length": 644.56875, |
| "epoch": 0.007566043596339628, |
| "grad_norm": 0.18431080877780914, |
| "kl": 0.006376700336113572, |
| "learning_rate": 3.7813562519996633e-06, |
| "loss": 0.0003, |
| "reward": 0.775, |
| "reward_std": 0.2690692335367203, |
| "rewards/custom_reward_simplified_v7_dblog": 0.775, |
| "step": 950 |
| }, |
| { |
| "completion_length": 706.7125, |
| "epoch": 0.007645686160511624, |
| "grad_norm": 0.11362796276807785, |
| "kl": 0.0065676989033818245, |
| "learning_rate": 3.7500000000000005e-06, |
| "loss": 0.0003, |
| "reward": 0.753125, |
| "reward_std": 0.3238763153553009, |
| "rewards/custom_reward_simplified_v7_dblog": 0.753125, |
| "step": 960 |
| }, |
| { |
| "completion_length": 591.60625, |
| "epoch": 0.00772532872468362, |
| "grad_norm": 0.006601857952773571, |
| "kl": 0.0061999865574762225, |
| "learning_rate": 3.7183793278181063e-06, |
| "loss": 0.0002, |
| "reward": 0.978125, |
| "reward_std": 0.32862835973501203, |
| "rewards/custom_reward_simplified_v7_dblog": 0.978125, |
| "step": 970 |
| }, |
| { |
| "completion_length": 623.40625, |
| "epoch": 0.007804971288855616, |
| "grad_norm": 0.24265889823436737, |
| "kl": 0.006443582929205149, |
| "learning_rate": 3.6865009243691015e-06, |
| "loss": 0.0003, |
| "reward": 0.790625, |
| "reward_std": 0.35499989837408064, |
| "rewards/custom_reward_simplified_v7_dblog": 0.790625, |
| "step": 980 |
| }, |
| { |
| "completion_length": 677.65625, |
| "epoch": 0.007884613853027612, |
| "grad_norm": 0.23094038665294647, |
| "kl": 0.006802499154582619, |
| "learning_rate": 3.654371533087586e-06, |
| "loss": 0.0003, |
| "reward": 0.80625, |
| "reward_std": 0.3126889310777187, |
| "rewards/custom_reward_simplified_v7_dblog": 0.80625, |
| "step": 990 |
| }, |
| { |
| "completion_length": 703.0125, |
| "epoch": 0.007964256417199608, |
| "grad_norm": 0.2269383817911148, |
| "kl": 0.006587388808839023, |
| "learning_rate": 3.621997950501156e-06, |
| "loss": 0.0003, |
| "reward": 0.83125, |
| "reward_std": 0.3684743233025074, |
| "rewards/custom_reward_simplified_v7_dblog": 0.83125, |
| "step": 1000 |
| }, |
| { |
| "completion_length": 702.75, |
| "epoch": 0.008043898981371604, |
| "grad_norm": 0.25571930408477783, |
| "kl": 0.0066094894893467425, |
| "learning_rate": 3.5893870247926986e-06, |
| "loss": 0.0003, |
| "reward": 0.690625, |
| "reward_std": 0.27608626931905744, |
| "rewards/custom_reward_simplified_v7_dblog": 0.690625, |
| "step": 1010 |
| }, |
| { |
| "completion_length": 634.18125, |
| "epoch": 0.0081235415455436, |
| "grad_norm": 0.006109423469752073, |
| "kl": 0.006831615581177175, |
| "learning_rate": 3.556545654351749e-06, |
| "loss": 0.0003, |
| "reward": 0.85625, |
| "reward_std": 0.2714505262672901, |
| "rewards/custom_reward_simplified_v7_dblog": 0.85625, |
| "step": 1020 |
| }, |
| { |
| "completion_length": 768.13125, |
| "epoch": 0.008203184109715597, |
| "grad_norm": 0.20112627744674683, |
| "kl": 0.006995444605126977, |
| "learning_rate": 3.5234807863152316e-06, |
| "loss": 0.0003, |
| "reward": 0.609375, |
| "reward_std": 0.2496856138110161, |
| "rewards/custom_reward_simplified_v7_dblog": 0.609375, |
| "step": 1030 |
| }, |
| { |
| "completion_length": 777.525, |
| "epoch": 0.008282826673887592, |
| "grad_norm": 0.2836349606513977, |
| "kl": 0.007392951846122741, |
| "learning_rate": 3.4901994150978926e-06, |
| "loss": 0.0003, |
| "reward": 0.675, |
| "reward_std": 0.26406350955367086, |
| "rewards/custom_reward_simplified_v7_dblog": 0.675, |
| "step": 1040 |
| }, |
| { |
| "completion_length": 719.2875, |
| "epoch": 0.008362469238059589, |
| "grad_norm": 0.1799333542585373, |
| "kl": 0.007057315914425999, |
| "learning_rate": 3.4567085809127247e-06, |
| "loss": 0.0003, |
| "reward": 0.790625, |
| "reward_std": 0.33950999528169634, |
| "rewards/custom_reward_simplified_v7_dblog": 0.790625, |
| "step": 1050 |
| }, |
| { |
| "completion_length": 621.4875, |
| "epoch": 0.008442111802231585, |
| "grad_norm": 0.25109627842903137, |
| "kl": 0.006540448497980833, |
| "learning_rate": 3.4230153682817112e-06, |
| "loss": 0.0003, |
| "reward": 0.85, |
| "reward_std": 0.30627945214509966, |
| "rewards/custom_reward_simplified_v7_dblog": 0.85, |
| "step": 1060 |
| }, |
| { |
| "completion_length": 671.04375, |
| "epoch": 0.00852175436640358, |
| "grad_norm": 0.1299162656068802, |
| "kl": 0.006574284215457737, |
| "learning_rate": 3.389126904537192e-06, |
| "loss": 0.0003, |
| "reward": 0.865625, |
| "reward_std": 0.37070034593343737, |
| "rewards/custom_reward_simplified_v7_dblog": 0.865625, |
| "step": 1070 |
| }, |
| { |
| "completion_length": 638.23125, |
| "epoch": 0.008601396930575577, |
| "grad_norm": 0.23796696960926056, |
| "kl": 0.0075248789740726355, |
| "learning_rate": 3.3550503583141726e-06, |
| "loss": 0.0003, |
| "reward": 0.746875, |
| "reward_std": 0.25020881071686746, |
| "rewards/custom_reward_simplified_v7_dblog": 0.746875, |
| "step": 1080 |
| }, |
| { |
| "completion_length": 634.9375, |
| "epoch": 0.008681039494747573, |
| "grad_norm": 0.2958204448223114, |
| "kl": 0.006533738202415406, |
| "learning_rate": 3.3207929380339034e-06, |
| "loss": 0.0003, |
| "reward": 0.896875, |
| "reward_std": 0.38549663573503495, |
| "rewards/custom_reward_simplified_v7_dblog": 0.896875, |
| "step": 1090 |
| }, |
| { |
| "completion_length": 661.2625, |
| "epoch": 0.00876068205891957, |
| "grad_norm": 0.007367302197962999, |
| "kl": 0.007355101336725056, |
| "learning_rate": 3.2863618903790346e-06, |
| "loss": 0.0003, |
| "reward": 0.71875, |
| "reward_std": 0.25932966247200967, |
| "rewards/custom_reward_simplified_v7_dblog": 0.71875, |
| "step": 1100 |
| }, |
| { |
| "completion_length": 700.48125, |
| "epoch": 0.008840324623091565, |
| "grad_norm": 0.28138336539268494, |
| "kl": 0.007267917576245964, |
| "learning_rate": 3.2517644987606827e-06, |
| "loss": 0.0003, |
| "reward": 0.9125, |
| "reward_std": 0.33715927675366403, |
| "rewards/custom_reward_simplified_v7_dblog": 0.9125, |
| "step": 1110 |
| }, |
| { |
| "completion_length": 662.26875, |
| "epoch": 0.008919967187263561, |
| "grad_norm": 0.1348627209663391, |
| "kl": 0.007481782068498433, |
| "learning_rate": 3.217008081777726e-06, |
| "loss": 0.0003, |
| "reward": 0.728125, |
| "reward_std": 0.2547163799405098, |
| "rewards/custom_reward_simplified_v7_dblog": 0.728125, |
| "step": 1120 |
| }, |
| { |
| "completion_length": 733.2125, |
| "epoch": 0.008999609751435557, |
| "grad_norm": 0.2320898026227951, |
| "kl": 0.007608366897329688, |
| "learning_rate": 3.182099991668653e-06, |
| "loss": 0.0003, |
| "reward": 0.60625, |
| "reward_std": 0.2975068032741547, |
| "rewards/custom_reward_simplified_v7_dblog": 0.60625, |
| "step": 1130 |
| }, |
| { |
| "completion_length": 603.5, |
| "epoch": 0.009079252315607553, |
| "grad_norm": 0.23401154577732086, |
| "kl": 0.007222792156971991, |
| "learning_rate": 3.147047612756302e-06, |
| "loss": 0.0003, |
| "reward": 0.875, |
| "reward_std": 0.2553515017032623, |
| "rewards/custom_reward_simplified_v7_dblog": 0.875, |
| "step": 1140 |
| }, |
| { |
| "completion_length": 704.44375, |
| "epoch": 0.009158894879779549, |
| "grad_norm": 0.2538968324661255, |
| "kl": 0.007968966104090213, |
| "learning_rate": 3.1118583598858097e-06, |
| "loss": 0.0003, |
| "reward": 0.6875, |
| "reward_std": 0.29204289317131044, |
| "rewards/custom_reward_simplified_v7_dblog": 0.6875, |
| "step": 1150 |
| }, |
| { |
| "completion_length": 641.88125, |
| "epoch": 0.009238537443951545, |
| "grad_norm": 0.007003675680607557, |
| "kl": 0.007272082474082708, |
| "learning_rate": 3.0765396768561005e-06, |
| "loss": 0.0003, |
| "reward": 0.875, |
| "reward_std": 0.2666669487953186, |
| "rewards/custom_reward_simplified_v7_dblog": 0.875, |
| "step": 1160 |
| }, |
| { |
| "completion_length": 645.55625, |
| "epoch": 0.009318180008123542, |
| "grad_norm": 0.005993107333779335, |
| "kl": 0.00769920782186091, |
| "learning_rate": 3.0410990348452572e-06, |
| "loss": 0.0003, |
| "reward": 0.846875, |
| "reward_std": 0.29315834268927576, |
| "rewards/custom_reward_simplified_v7_dblog": 0.846875, |
| "step": 1170 |
| }, |
| { |
| "completion_length": 690.65625, |
| "epoch": 0.009397822572295537, |
| "grad_norm": 0.196693554520607, |
| "kl": 0.007807633420452475, |
| "learning_rate": 3.0055439308300954e-06, |
| "loss": 0.0003, |
| "reward": 0.80625, |
| "reward_std": 0.34684801325201986, |
| "rewards/custom_reward_simplified_v7_dblog": 0.80625, |
| "step": 1180 |
| }, |
| { |
| "completion_length": 652.125, |
| "epoch": 0.009477465136467534, |
| "grad_norm": 0.009493391960859299, |
| "kl": 0.008702660608105362, |
| "learning_rate": 2.96988188600028e-06, |
| "loss": 0.0003, |
| "reward": 0.85625, |
| "reward_std": 0.21074047386646272, |
| "rewards/custom_reward_simplified_v7_dblog": 0.85625, |
| "step": 1190 |
| }, |
| { |
| "completion_length": 660.8125, |
| "epoch": 0.00955710770063953, |
| "grad_norm": 0.250519335269928, |
| "kl": 0.008729650382883846, |
| "learning_rate": 2.9341204441673267e-06, |
| "loss": 0.0003, |
| "reward": 0.728125, |
| "reward_std": 0.33152099549770353, |
| "rewards/custom_reward_simplified_v7_dblog": 0.728125, |
| "step": 1200 |
| }, |
| { |
| "completion_length": 660.7375, |
| "epoch": 0.009636750264811526, |
| "grad_norm": 0.20679971575737, |
| "kl": 0.00826664932537824, |
| "learning_rate": 2.898267170168807e-06, |
| "loss": 0.0003, |
| "reward": 0.665625, |
| "reward_std": 0.25403511226177217, |
| "rewards/custom_reward_simplified_v7_dblog": 0.665625, |
| "step": 1210 |
| }, |
| { |
| "completion_length": 653.59375, |
| "epoch": 0.009716392828983521, |
| "grad_norm": 0.14609546959400177, |
| "kl": 0.007603704649955034, |
| "learning_rate": 2.862329648268117e-06, |
| "loss": 0.0003, |
| "reward": 0.94375, |
| "reward_std": 0.26154626756906507, |
| "rewards/custom_reward_simplified_v7_dblog": 0.94375, |
| "step": 1220 |
| }, |
| { |
| "completion_length": 635.0125, |
| "epoch": 0.009796035393155518, |
| "grad_norm": 0.14301441609859467, |
| "kl": 0.008189951698295773, |
| "learning_rate": 2.82631548055013e-06, |
| "loss": 0.0003, |
| "reward": 0.9, |
| "reward_std": 0.2126667931675911, |
| "rewards/custom_reward_simplified_v7_dblog": 0.9, |
| "step": 1230 |
| }, |
| { |
| "completion_length": 816.5375, |
| "epoch": 0.009875677957327514, |
| "grad_norm": 0.1681988686323166, |
| "kl": 0.01006167777813971, |
| "learning_rate": 2.7902322853130758e-06, |
| "loss": 0.0004, |
| "reward": 0.51875, |
| "reward_std": 0.27570038065314295, |
| "rewards/custom_reward_simplified_v7_dblog": 0.51875, |
| "step": 1240 |
| }, |
| { |
| "completion_length": 710.75, |
| "epoch": 0.00995532052149951, |
| "grad_norm": 0.09834864735603333, |
| "kl": 0.010588118969462813, |
| "learning_rate": 2.754087695457005e-06, |
| "loss": 0.0004, |
| "reward": 0.6625, |
| "reward_std": 0.19232839569449425, |
| "rewards/custom_reward_simplified_v7_dblog": 0.6625, |
| "step": 1250 |
| }, |
| { |
| "completion_length": 615.5875, |
| "epoch": 0.010034963085671506, |
| "grad_norm": 0.14006367325782776, |
| "kl": 0.008278649020940065, |
| "learning_rate": 2.717889356869146e-06, |
| "loss": 0.0003, |
| "reward": 0.903125, |
| "reward_std": 0.3407335430383682, |
| "rewards/custom_reward_simplified_v7_dblog": 0.903125, |
| "step": 1260 |
| }, |
| { |
| "completion_length": 727.70625, |
| "epoch": 0.010114605649843502, |
| "grad_norm": 0.005724642425775528, |
| "kl": 0.009203878976404668, |
| "learning_rate": 2.681644926806527e-06, |
| "loss": 0.0004, |
| "reward": 0.60625, |
| "reward_std": 0.2156815566122532, |
| "rewards/custom_reward_simplified_v7_dblog": 0.60625, |
| "step": 1270 |
| }, |
| { |
| "completion_length": 641.9125, |
| "epoch": 0.0101942482140155, |
| "grad_norm": 0.21494239568710327, |
| "kl": 0.008675340004265309, |
| "learning_rate": 2.6453620722761897e-06, |
| "loss": 0.0003, |
| "reward": 0.81875, |
| "reward_std": 0.22831376343965532, |
| "rewards/custom_reward_simplified_v7_dblog": 0.81875, |
| "step": 1280 |
| }, |
| { |
| "completion_length": 650.5, |
| "epoch": 0.010273890778187494, |
| "grad_norm": 0.22972695529460907, |
| "kl": 0.008116158202756196, |
| "learning_rate": 2.6090484684133406e-06, |
| "loss": 0.0003, |
| "reward": 0.921875, |
| "reward_std": 0.2564812809228897, |
| "rewards/custom_reward_simplified_v7_dblog": 0.921875, |
| "step": 1290 |
| }, |
| { |
| "completion_length": 657.94375, |
| "epoch": 0.010353533342359491, |
| "grad_norm": 0.15338486433029175, |
| "kl": 0.009256175020709634, |
| "learning_rate": 2.572711796857779e-06, |
| "loss": 0.0004, |
| "reward": 0.709375, |
| "reward_std": 0.21537503451108933, |
| "rewards/custom_reward_simplified_v7_dblog": 0.709375, |
| "step": 1300 |
| }, |
| { |
| "completion_length": 650.58125, |
| "epoch": 0.010433175906531486, |
| "grad_norm": 0.14920295774936676, |
| "kl": 0.009564152918756008, |
| "learning_rate": 2.5363597441289574e-06, |
| "loss": 0.0004, |
| "reward": 0.828125, |
| "reward_std": 0.2882704295217991, |
| "rewards/custom_reward_simplified_v7_dblog": 0.828125, |
| "step": 1310 |
| }, |
| { |
| "completion_length": 723.89375, |
| "epoch": 0.010512818470703483, |
| "grad_norm": 0.20945711433887482, |
| "kl": 0.010788540355861187, |
| "learning_rate": 2.5e-06, |
| "loss": 0.0004, |
| "reward": 0.7125, |
| "reward_std": 0.26380954012274743, |
| "rewards/custom_reward_simplified_v7_dblog": 0.7125, |
| "step": 1320 |
| }, |
| { |
| "completion_length": 715.7, |
| "epoch": 0.010592461034875478, |
| "grad_norm": 0.16817767918109894, |
| "kl": 0.013910629483871163, |
| "learning_rate": 2.4636402558710434e-06, |
| "loss": 0.0006, |
| "reward": 0.759375, |
| "reward_std": 0.2193169414997101, |
| "rewards/custom_reward_simplified_v7_dblog": 0.759375, |
| "step": 1330 |
| }, |
| { |
| "completion_length": 655.90625, |
| "epoch": 0.010672103599047475, |
| "grad_norm": 0.2265154868364334, |
| "kl": 0.00848452327772975, |
| "learning_rate": 2.4272882031422216e-06, |
| "loss": 0.0003, |
| "reward": 0.78125, |
| "reward_std": 0.3443989932537079, |
| "rewards/custom_reward_simplified_v7_dblog": 0.78125, |
| "step": 1340 |
| }, |
| { |
| "completion_length": 660.075, |
| "epoch": 0.01075174616321947, |
| "grad_norm": 0.24644052982330322, |
| "kl": 0.009867909434251487, |
| "learning_rate": 2.3909515315866606e-06, |
| "loss": 0.0004, |
| "reward": 0.79375, |
| "reward_std": 0.29604131579399107, |
| "rewards/custom_reward_simplified_v7_dblog": 0.79375, |
| "step": 1350 |
| }, |
| { |
| "completion_length": 633.9125, |
| "epoch": 0.010831388727391467, |
| "grad_norm": 0.1637645810842514, |
| "kl": 0.00936238830909133, |
| "learning_rate": 2.3546379277238107e-06, |
| "loss": 0.0004, |
| "reward": 0.90625, |
| "reward_std": 0.324691192060709, |
| "rewards/custom_reward_simplified_v7_dblog": 0.90625, |
| "step": 1360 |
| }, |
| { |
| "completion_length": 674.03125, |
| "epoch": 0.010911031291563462, |
| "grad_norm": 0.2471015304327011, |
| "kl": 0.010726678185164928, |
| "learning_rate": 2.318355073193474e-06, |
| "loss": 0.0004, |
| "reward": 0.65625, |
| "reward_std": 0.21728940233588218, |
| "rewards/custom_reward_simplified_v7_dblog": 0.65625, |
| "step": 1370 |
| }, |
| { |
| "completion_length": 682.31875, |
| "epoch": 0.01099067385573546, |
| "grad_norm": 0.10079372674226761, |
| "kl": 0.009952771244570613, |
| "learning_rate": 2.2821106431308546e-06, |
| "loss": 0.0004, |
| "reward": 0.89375, |
| "reward_std": 0.33092204555869104, |
| "rewards/custom_reward_simplified_v7_dblog": 0.89375, |
| "step": 1380 |
| }, |
| { |
| "completion_length": 669.09375, |
| "epoch": 0.011070316419907454, |
| "grad_norm": 0.19604210555553436, |
| "kl": 0.011396997445262968, |
| "learning_rate": 2.2459123045429953e-06, |
| "loss": 0.0005, |
| "reward": 0.784375, |
| "reward_std": 0.29770964160561564, |
| "rewards/custom_reward_simplified_v7_dblog": 0.784375, |
| "step": 1390 |
| }, |
| { |
| "completion_length": 651.80625, |
| "epoch": 0.011149958984079451, |
| "grad_norm": 0.27397212386131287, |
| "kl": 0.01038803206756711, |
| "learning_rate": 2.2097677146869242e-06, |
| "loss": 0.0004, |
| "reward": 0.878125, |
| "reward_std": 0.27883157432079314, |
| "rewards/custom_reward_simplified_v7_dblog": 0.878125, |
| "step": 1400 |
| }, |
| { |
| "completion_length": 687.5125, |
| "epoch": 0.011229601548251448, |
| "grad_norm": 0.22397036850452423, |
| "kl": 0.012094876240007579, |
| "learning_rate": 2.173684519449872e-06, |
| "loss": 0.0005, |
| "reward": 0.834375, |
| "reward_std": 0.28866922557353974, |
| "rewards/custom_reward_simplified_v7_dblog": 0.834375, |
| "step": 1410 |
| }, |
| { |
| "completion_length": 661.26875, |
| "epoch": 0.011309244112423443, |
| "grad_norm": 0.2519758939743042, |
| "kl": 0.011373027227818966, |
| "learning_rate": 2.1376703517318835e-06, |
| "loss": 0.0005, |
| "reward": 0.853125, |
| "reward_std": 0.32343359887599943, |
| "rewards/custom_reward_simplified_v7_dblog": 0.853125, |
| "step": 1420 |
| }, |
| { |
| "completion_length": 677.73125, |
| "epoch": 0.01138888667659544, |
| "grad_norm": 0.2689824104309082, |
| "kl": 0.011312256497330964, |
| "learning_rate": 2.101732829831194e-06, |
| "loss": 0.0005, |
| "reward": 0.765625, |
| "reward_std": 0.27808423787355424, |
| "rewards/custom_reward_simplified_v7_dblog": 0.765625, |
| "step": 1430 |
| }, |
| { |
| "completion_length": 619.71875, |
| "epoch": 0.011468529240767435, |
| "grad_norm": 0.32441073656082153, |
| "kl": 0.010685316193848849, |
| "learning_rate": 2.0658795558326745e-06, |
| "loss": 0.0004, |
| "reward": 0.871875, |
| "reward_std": 0.2622031569480896, |
| "rewards/custom_reward_simplified_v7_dblog": 0.871875, |
| "step": 1440 |
| }, |
| { |
| "completion_length": 613.3875, |
| "epoch": 0.011548171804939432, |
| "grad_norm": 0.15561087429523468, |
| "kl": 0.012302201450802385, |
| "learning_rate": 2.0301181139997206e-06, |
| "loss": 0.0005, |
| "reward": 0.8125, |
| "reward_std": 0.26520399302244185, |
| "rewards/custom_reward_simplified_v7_dblog": 0.8125, |
| "step": 1450 |
| }, |
| { |
| "completion_length": 677.23125, |
| "epoch": 0.011627814369111427, |
| "grad_norm": 0.2590673267841339, |
| "kl": 0.011339499452151357, |
| "learning_rate": 1.994456069169906e-06, |
| "loss": 0.0005, |
| "reward": 0.64375, |
| "reward_std": 0.23993425220251083, |
| "rewards/custom_reward_simplified_v7_dblog": 0.64375, |
| "step": 1460 |
| }, |
| { |
| "completion_length": 702.1625, |
| "epoch": 0.011707456933283424, |
| "grad_norm": 0.012393876910209656, |
| "kl": 0.012036008480936288, |
| "learning_rate": 1.958900965154743e-06, |
| "loss": 0.0005, |
| "reward": 0.64375, |
| "reward_std": 0.21832374781370162, |
| "rewards/custom_reward_simplified_v7_dblog": 0.64375, |
| "step": 1470 |
| }, |
| { |
| "completion_length": 722.06875, |
| "epoch": 0.01178709949745542, |
| "grad_norm": 0.13200955092906952, |
| "kl": 0.013854384049773216, |
| "learning_rate": 1.9234603231439e-06, |
| "loss": 0.0006, |
| "reward": 0.790625, |
| "reward_std": 0.2784456007182598, |
| "rewards/custom_reward_simplified_v7_dblog": 0.790625, |
| "step": 1480 |
| }, |
| { |
| "completion_length": 664.46875, |
| "epoch": 0.011866742061627416, |
| "grad_norm": 0.14230677485466003, |
| "kl": 0.012553655169904232, |
| "learning_rate": 1.8881416401141905e-06, |
| "loss": 0.0005, |
| "reward": 0.9, |
| "reward_std": 0.23252918049693108, |
| "rewards/custom_reward_simplified_v7_dblog": 0.9, |
| "step": 1490 |
| }, |
| { |
| "completion_length": 653.79375, |
| "epoch": 0.011946384625799411, |
| "grad_norm": 0.17014774680137634, |
| "kl": 0.01346926314290613, |
| "learning_rate": 1.852952387243698e-06, |
| "loss": 0.0005, |
| "reward": 0.740625, |
| "reward_std": 0.22115055918693544, |
| "rewards/custom_reward_simplified_v7_dblog": 0.740625, |
| "step": 1500 |
| }, |
| { |
| "completion_length": 640.39375, |
| "epoch": 0.012026027189971408, |
| "grad_norm": 0.17104946076869965, |
| "kl": 0.013007838977500796, |
| "learning_rate": 1.8179000083313483e-06, |
| "loss": 0.0005, |
| "reward": 0.9, |
| "reward_std": 0.28725912123918534, |
| "rewards/custom_reward_simplified_v7_dblog": 0.9, |
| "step": 1510 |
| }, |
| { |
| "completion_length": 650.7125, |
| "epoch": 0.012105669754143405, |
| "grad_norm": 0.1524449735879898, |
| "kl": 0.012339419685304165, |
| "learning_rate": 1.7829919182222752e-06, |
| "loss": 0.0005, |
| "reward": 0.790625, |
| "reward_std": 0.3324665643274784, |
| "rewards/custom_reward_simplified_v7_dblog": 0.790625, |
| "step": 1520 |
| }, |
| { |
| "completion_length": 674.3625, |
| "epoch": 0.0121853123183154, |
| "grad_norm": 0.2344941943883896, |
| "kl": 0.012514100456610323, |
| "learning_rate": 1.7482355012393177e-06, |
| "loss": 0.0005, |
| "reward": 0.859375, |
| "reward_std": 0.3387090668082237, |
| "rewards/custom_reward_simplified_v7_dblog": 0.859375, |
| "step": 1530 |
| }, |
| { |
| "completion_length": 718.6, |
| "epoch": 0.012264954882487397, |
| "grad_norm": 0.2631664276123047, |
| "kl": 0.014576551388017833, |
| "learning_rate": 1.7136381096209665e-06, |
| "loss": 0.0006, |
| "reward": 0.653125, |
| "reward_std": 0.24619419425725936, |
| "rewards/custom_reward_simplified_v7_dblog": 0.653125, |
| "step": 1540 |
| }, |
| { |
| "completion_length": 706.28125, |
| "epoch": 0.012344597446659392, |
| "grad_norm": 0.20134921371936798, |
| "kl": 0.012202254333533346, |
| "learning_rate": 1.6792070619660977e-06, |
| "loss": 0.0005, |
| "reward": 0.84375, |
| "reward_std": 0.3321776181459427, |
| "rewards/custom_reward_simplified_v7_dblog": 0.84375, |
| "step": 1550 |
| }, |
| { |
| "completion_length": 645.28125, |
| "epoch": 0.01242424001083139, |
| "grad_norm": 0.1851159930229187, |
| "kl": 0.014482964109629393, |
| "learning_rate": 1.6449496416858285e-06, |
| "loss": 0.0006, |
| "reward": 0.85625, |
| "reward_std": 0.20507382601499557, |
| "rewards/custom_reward_simplified_v7_dblog": 0.85625, |
| "step": 1560 |
| }, |
| { |
| "completion_length": 614.08125, |
| "epoch": 0.012503882575003384, |
| "grad_norm": 0.27418458461761475, |
| "kl": 0.013118641986511647, |
| "learning_rate": 1.6108730954628093e-06, |
| "loss": 0.0005, |
| "reward": 0.79375, |
| "reward_std": 0.2820776253938675, |
| "rewards/custom_reward_simplified_v7_dblog": 0.79375, |
| "step": 1570 |
| }, |
| { |
| "completion_length": 695.91875, |
| "epoch": 0.012583525139175381, |
| "grad_norm": 0.2425900250673294, |
| "kl": 0.013333506928756834, |
| "learning_rate": 1.5769846317182894e-06, |
| "loss": 0.0005, |
| "reward": 0.7625, |
| "reward_std": 0.2879462748765945, |
| "rewards/custom_reward_simplified_v7_dblog": 0.7625, |
| "step": 1580 |
| }, |
| { |
| "completion_length": 673.99375, |
| "epoch": 0.012663167703347376, |
| "grad_norm": 0.2331763356924057, |
| "kl": 0.013234515953809024, |
| "learning_rate": 1.5432914190872757e-06, |
| "loss": 0.0005, |
| "reward": 0.775, |
| "reward_std": 0.2913659870624542, |
| "rewards/custom_reward_simplified_v7_dblog": 0.775, |
| "step": 1590 |
| }, |
| { |
| "completion_length": 678.54375, |
| "epoch": 0.012742810267519373, |
| "grad_norm": 0.16657988727092743, |
| "kl": 0.012798944069072605, |
| "learning_rate": 1.509800584902108e-06, |
| "loss": 0.0005, |
| "reward": 0.759375, |
| "reward_std": 0.2901748239994049, |
| "rewards/custom_reward_simplified_v7_dblog": 0.759375, |
| "step": 1600 |
| }, |
| { |
| "completion_length": 652.49375, |
| "epoch": 0.012822452831691368, |
| "grad_norm": 0.12168209999799728, |
| "kl": 0.012750855972990393, |
| "learning_rate": 1.4765192136847686e-06, |
| "loss": 0.0005, |
| "reward": 0.728125, |
| "reward_std": 0.26915703564882276, |
| "rewards/custom_reward_simplified_v7_dblog": 0.728125, |
| "step": 1610 |
| }, |
| { |
| "completion_length": 660.95625, |
| "epoch": 0.012902095395863365, |
| "grad_norm": 0.13546766340732574, |
| "kl": 0.013546877074986696, |
| "learning_rate": 1.443454345648252e-06, |
| "loss": 0.0005, |
| "reward": 0.790625, |
| "reward_std": 0.1937400370836258, |
| "rewards/custom_reward_simplified_v7_dblog": 0.790625, |
| "step": 1620 |
| }, |
| { |
| "completion_length": 638.00625, |
| "epoch": 0.012981737960035362, |
| "grad_norm": 0.17955924570560455, |
| "kl": 0.012779112858697771, |
| "learning_rate": 1.4106129752073023e-06, |
| "loss": 0.0005, |
| "reward": 0.790625, |
| "reward_std": 0.2674853280186653, |
| "rewards/custom_reward_simplified_v7_dblog": 0.790625, |
| "step": 1630 |
| }, |
| { |
| "completion_length": 678.1125, |
| "epoch": 0.013061380524207357, |
| "grad_norm": 0.2616170644760132, |
| "kl": 0.01720189054030925, |
| "learning_rate": 1.3780020494988447e-06, |
| "loss": 0.0007, |
| "reward": 0.771875, |
| "reward_std": 0.27255760729312895, |
| "rewards/custom_reward_simplified_v7_dblog": 0.771875, |
| "step": 1640 |
| }, |
| { |
| "completion_length": 639.43125, |
| "epoch": 0.013141023088379354, |
| "grad_norm": 0.1487816423177719, |
| "kl": 0.014415727299638092, |
| "learning_rate": 1.3456284669124159e-06, |
| "loss": 0.0006, |
| "reward": 0.73125, |
| "reward_std": 0.24298151433467866, |
| "rewards/custom_reward_simplified_v7_dblog": 0.73125, |
| "step": 1650 |
| }, |
| { |
| "completion_length": 727.9625, |
| "epoch": 0.01322066565255135, |
| "grad_norm": 0.14750860631465912, |
| "kl": 0.018067248188890515, |
| "learning_rate": 1.313499075630899e-06, |
| "loss": 0.0007, |
| "reward": 0.721875, |
| "reward_std": 0.30838647186756135, |
| "rewards/custom_reward_simplified_v7_dblog": 0.721875, |
| "step": 1660 |
| }, |
| { |
| "completion_length": 780.08125, |
| "epoch": 0.013300308216723346, |
| "grad_norm": 0.2386309951543808, |
| "kl": 0.017110610962845385, |
| "learning_rate": 1.2816206721818944e-06, |
| "loss": 0.0007, |
| "reward": 0.6375, |
| "reward_std": 0.26727318242192266, |
| "rewards/custom_reward_simplified_v7_dblog": 0.6375, |
| "step": 1670 |
| }, |
| { |
| "completion_length": 655.70625, |
| "epoch": 0.013379950780895341, |
| "grad_norm": 0.2751936614513397, |
| "kl": 0.01622524333652109, |
| "learning_rate": 1.2500000000000007e-06, |
| "loss": 0.0006, |
| "reward": 0.878125, |
| "reward_std": 0.284642493724823, |
| "rewards/custom_reward_simplified_v7_dblog": 0.878125, |
| "step": 1680 |
| }, |
| { |
| "completion_length": 684.98125, |
| "epoch": 0.013459593345067338, |
| "grad_norm": 0.23118546605110168, |
| "kl": 0.01642036633566022, |
| "learning_rate": 1.218643748000337e-06, |
| "loss": 0.0007, |
| "reward": 0.85625, |
| "reward_std": 0.339317075163126, |
| "rewards/custom_reward_simplified_v7_dblog": 0.85625, |
| "step": 1690 |
| }, |
| { |
| "completion_length": 743.51875, |
| "epoch": 0.013539235909239333, |
| "grad_norm": 0.22867274284362793, |
| "kl": 0.01721250016707927, |
| "learning_rate": 1.1875585491636e-06, |
| "loss": 0.0007, |
| "reward": 0.653125, |
| "reward_std": 0.277196903526783, |
| "rewards/custom_reward_simplified_v7_dblog": 0.653125, |
| "step": 1700 |
| }, |
| { |
| "completion_length": 637.9625, |
| "epoch": 0.01361887847341133, |
| "grad_norm": 0.2428259700536728, |
| "kl": 0.014563425956293941, |
| "learning_rate": 1.1567509791329402e-06, |
| "loss": 0.0006, |
| "reward": 0.865625, |
| "reward_std": 0.23967689424753189, |
| "rewards/custom_reward_simplified_v7_dblog": 0.865625, |
| "step": 1710 |
| }, |
| { |
| "completion_length": 722.925, |
| "epoch": 0.013698521037583325, |
| "grad_norm": 0.21737752854824066, |
| "kl": 0.014987437543459237, |
| "learning_rate": 1.1262275548229852e-06, |
| "loss": 0.0006, |
| "reward": 0.725, |
| "reward_std": 0.26179009675979614, |
| "rewards/custom_reward_simplified_v7_dblog": 0.725, |
| "step": 1720 |
| }, |
| { |
| "completion_length": 633.31875, |
| "epoch": 0.013778163601755322, |
| "grad_norm": 0.22654354572296143, |
| "kl": 0.013244283269159496, |
| "learning_rate": 1.0959947330412681e-06, |
| "loss": 0.0005, |
| "reward": 0.921875, |
| "reward_std": 0.2066536843776703, |
| "rewards/custom_reward_simplified_v7_dblog": 0.921875, |
| "step": 1730 |
| }, |
| { |
| "completion_length": 615.29375, |
| "epoch": 0.013857806165927319, |
| "grad_norm": 0.22673261165618896, |
| "kl": 0.014753601653501392, |
| "learning_rate": 1.0660589091223854e-06, |
| "loss": 0.0006, |
| "reward": 0.815625, |
| "reward_std": 0.30853241235017775, |
| "rewards/custom_reward_simplified_v7_dblog": 0.815625, |
| "step": 1740 |
| }, |
| { |
| "completion_length": 630.3625, |
| "epoch": 0.013937448730099314, |
| "grad_norm": 0.012196751311421394, |
| "kl": 0.01440229129511863, |
| "learning_rate": 1.0364264155751489e-06, |
| "loss": 0.0006, |
| "reward": 0.915625, |
| "reward_std": 0.23927971720695496, |
| "rewards/custom_reward_simplified_v7_dblog": 0.915625, |
| "step": 1750 |
| }, |
| { |
| "completion_length": 715.2125, |
| "epoch": 0.014017091294271311, |
| "grad_norm": 0.2587921619415283, |
| "kl": 0.017100332980044188, |
| "learning_rate": 1.0071035207430352e-06, |
| "loss": 0.0007, |
| "reward": 0.74375, |
| "reward_std": 0.2990465022623539, |
| "rewards/custom_reward_simplified_v7_dblog": 0.74375, |
| "step": 1760 |
| }, |
| { |
| "completion_length": 682.74375, |
| "epoch": 0.014096733858443306, |
| "grad_norm": 0.24313370883464813, |
| "kl": 0.01778110705781728, |
| "learning_rate": 9.780964274781984e-07, |
| "loss": 0.0007, |
| "reward": 0.68125, |
| "reward_std": 0.2005969136953354, |
| "rewards/custom_reward_simplified_v7_dblog": 0.68125, |
| "step": 1770 |
| }, |
| { |
| "completion_length": 718.31875, |
| "epoch": 0.014176376422615303, |
| "grad_norm": 0.18841393291950226, |
| "kl": 0.015946343122050167, |
| "learning_rate": 9.494112718293503e-07, |
| "loss": 0.0006, |
| "reward": 0.771875, |
| "reward_std": 0.27307887077331544, |
| "rewards/custom_reward_simplified_v7_dblog": 0.771875, |
| "step": 1780 |
| }, |
| { |
| "completion_length": 707.1875, |
| "epoch": 0.014256018986787298, |
| "grad_norm": 0.2333621084690094, |
| "kl": 0.01652351173106581, |
| "learning_rate": 9.210541217437566e-07, |
| "loss": 0.0007, |
| "reward": 0.8125, |
| "reward_std": 0.2497081995010376, |
| "rewards/custom_reward_simplified_v7_dblog": 0.8125, |
| "step": 1790 |
| }, |
| { |
| "completion_length": 728.5375, |
| "epoch": 0.014335661550959295, |
| "grad_norm": 0.26783886551856995, |
| "kl": 0.018553019547834993, |
| "learning_rate": 8.930309757836517e-07, |
| "loss": 0.0007, |
| "reward": 0.75, |
| "reward_std": 0.28967257887125014, |
| "rewards/custom_reward_simplified_v7_dblog": 0.75, |
| "step": 1800 |
| }, |
| { |
| "completion_length": 689.26875, |
| "epoch": 0.01441530411513129, |
| "grad_norm": 0.17589329183101654, |
| "kl": 0.016255489736795425, |
| "learning_rate": 8.653477618573261e-07, |
| "loss": 0.0007, |
| "reward": 0.765625, |
| "reward_std": 0.3363394603133202, |
| "rewards/custom_reward_simplified_v7_dblog": 0.765625, |
| "step": 1810 |
| }, |
| { |
| "completion_length": 640.91875, |
| "epoch": 0.014494946679303287, |
| "grad_norm": 0.21075929701328278, |
| "kl": 0.015922663966193795, |
| "learning_rate": 8.380103359651554e-07, |
| "loss": 0.0006, |
| "reward": 0.925, |
| "reward_std": 0.3459245666861534, |
| "rewards/custom_reward_simplified_v7_dblog": 0.925, |
| "step": 1820 |
| }, |
| { |
| "completion_length": 708.60625, |
| "epoch": 0.014574589243475282, |
| "grad_norm": 0.00766308419406414, |
| "kl": 0.01772608202882111, |
| "learning_rate": 8.110244809608494e-07, |
| "loss": 0.0007, |
| "reward": 0.73125, |
| "reward_std": 0.2913930006325245, |
| "rewards/custom_reward_simplified_v7_dblog": 0.73125, |
| "step": 1830 |
| }, |
| { |
| "completion_length": 660.0375, |
| "epoch": 0.014654231807647279, |
| "grad_norm": 0.20974037051200867, |
| "kl": 0.014227323909290135, |
| "learning_rate": 7.843959053281663e-07, |
| "loss": 0.0006, |
| "reward": 0.809375, |
| "reward_std": 0.24926668480038644, |
| "rewards/custom_reward_simplified_v7_dblog": 0.809375, |
| "step": 1840 |
| }, |
| { |
| "completion_length": 729.71875, |
| "epoch": 0.014733874371819274, |
| "grad_norm": 0.24099427461624146, |
| "kl": 0.018935651518404484, |
| "learning_rate": 7.581302419733633e-07, |
| "loss": 0.0008, |
| "reward": 0.690625, |
| "reward_std": 0.32810748890042307, |
| "rewards/custom_reward_simplified_v7_dblog": 0.690625, |
| "step": 1850 |
| }, |
| { |
| "completion_length": 649.98125, |
| "epoch": 0.014813516935991271, |
| "grad_norm": 0.013280795887112617, |
| "kl": 0.01633880774024874, |
| "learning_rate": 7.322330470336314e-07, |
| "loss": 0.0007, |
| "reward": 0.91875, |
| "reward_std": 0.24432293996214866, |
| "rewards/custom_reward_simplified_v7_dblog": 0.91875, |
| "step": 1860 |
| }, |
| { |
| "completion_length": 669.09375, |
| "epoch": 0.014893159500163268, |
| "grad_norm": 0.2837064266204834, |
| "kl": 0.014348302804864942, |
| "learning_rate": 7.067097987017762e-07, |
| "loss": 0.0006, |
| "reward": 0.690625, |
| "reward_std": 0.2307182878255844, |
| "rewards/custom_reward_simplified_v7_dblog": 0.690625, |
| "step": 1870 |
| }, |
| { |
| "completion_length": 662.9625, |
| "epoch": 0.014972802064335263, |
| "grad_norm": 0.25689443945884705, |
| "kl": 0.01656266492791474, |
| "learning_rate": 6.815658960673782e-07, |
| "loss": 0.0007, |
| "reward": 0.85625, |
| "reward_std": 0.22758262380957603, |
| "rewards/custom_reward_simplified_v7_dblog": 0.85625, |
| "step": 1880 |
| }, |
| { |
| "completion_length": 719.24375, |
| "epoch": 0.01505244462850726, |
| "grad_norm": 0.22542421519756317, |
| "kl": 0.01744127394631505, |
| "learning_rate": 6.568066579746901e-07, |
| "loss": 0.0007, |
| "reward": 0.76875, |
| "reward_std": 0.2790658660233021, |
| "rewards/custom_reward_simplified_v7_dblog": 0.76875, |
| "step": 1890 |
| }, |
| { |
| "completion_length": 633.64375, |
| "epoch": 0.015132087192679255, |
| "grad_norm": 0.00903425831347704, |
| "kl": 0.014375879801809788, |
| "learning_rate": 6.324373218975105e-07, |
| "loss": 0.0006, |
| "reward": 0.725, |
| "reward_std": 0.2382744610309601, |
| "rewards/custom_reward_simplified_v7_dblog": 0.725, |
| "step": 1900 |
| }, |
| { |
| "completion_length": 767.7375, |
| "epoch": 0.015211729756851252, |
| "grad_norm": 0.1330222189426422, |
| "kl": 0.02190765142440796, |
| "learning_rate": 6.084630428312679e-07, |
| "loss": 0.0009, |
| "reward": 0.66875, |
| "reward_std": 0.27546602860093117, |
| "rewards/custom_reward_simplified_v7_dblog": 0.66875, |
| "step": 1910 |
| }, |
| { |
| "completion_length": 726.63125, |
| "epoch": 0.015291372321023247, |
| "grad_norm": 0.21655875444412231, |
| "kl": 0.02581467442214489, |
| "learning_rate": 5.848888922025553e-07, |
| "loss": 0.001, |
| "reward": 0.834375, |
| "reward_std": 0.38373097851872445, |
| "rewards/custom_reward_simplified_v7_dblog": 0.834375, |
| "step": 1920 |
| }, |
| { |
| "completion_length": 688.56875, |
| "epoch": 0.015371014885195244, |
| "grad_norm": 0.22155120968818665, |
| "kl": 0.025313653564080597, |
| "learning_rate": 5.617198567963353e-07, |
| "loss": 0.001, |
| "reward": 0.64375, |
| "reward_std": 0.2539114162325859, |
| "rewards/custom_reward_simplified_v7_dblog": 0.64375, |
| "step": 1930 |
| }, |
| { |
| "completion_length": 676.9125, |
| "epoch": 0.01545065744936724, |
| "grad_norm": 0.2373446673154831, |
| "kl": 0.018907574540935456, |
| "learning_rate": 5.389608377010608e-07, |
| "loss": 0.0008, |
| "reward": 0.821875, |
| "reward_std": 0.1906539335846901, |
| "rewards/custom_reward_simplified_v7_dblog": 0.821875, |
| "step": 1940 |
| }, |
| { |
| "completion_length": 640.675, |
| "epoch": 0.015530300013539236, |
| "grad_norm": 0.1865774542093277, |
| "kl": 0.014899229886941612, |
| "learning_rate": 5.166166492719124e-07, |
| "loss": 0.0006, |
| "reward": 0.725, |
| "reward_std": 0.2747412838041782, |
| "rewards/custom_reward_simplified_v7_dblog": 0.725, |
| "step": 1950 |
| }, |
| { |
| "completion_length": 651.70625, |
| "epoch": 0.015609942577711231, |
| "grad_norm": 0.2434624284505844, |
| "kl": 0.01636054664850235, |
| "learning_rate": 4.946920181123904e-07, |
| "loss": 0.0007, |
| "reward": 0.7625, |
| "reward_std": 0.2852359592914581, |
| "rewards/custom_reward_simplified_v7_dblog": 0.7625, |
| "step": 1960 |
| }, |
| { |
| "completion_length": 654.6625, |
| "epoch": 0.015689585141883226, |
| "grad_norm": 0.20749981701374054, |
| "kl": 0.018196922447532415, |
| "learning_rate": 4.7319158207446953e-07, |
| "loss": 0.0007, |
| "reward": 0.715625, |
| "reward_std": 0.2198973834514618, |
| "rewards/custom_reward_simplified_v7_dblog": 0.715625, |
| "step": 1970 |
| }, |
| { |
| "completion_length": 641.45, |
| "epoch": 0.015769227706055225, |
| "grad_norm": 0.23187489807605743, |
| "kl": 0.017989515024237335, |
| "learning_rate": 4.5211988927752026e-07, |
| "loss": 0.0007, |
| "reward": 0.7875, |
| "reward_std": 0.24450960606336594, |
| "rewards/custom_reward_simplified_v7_dblog": 0.7875, |
| "step": 1980 |
| }, |
| { |
| "completion_length": 643.6375, |
| "epoch": 0.01584887027022722, |
| "grad_norm": 0.235895574092865, |
| "kl": 0.015841626143082977, |
| "learning_rate": 4.3148139714622365e-07, |
| "loss": 0.0006, |
| "reward": 0.896875, |
| "reward_std": 0.26189937368035315, |
| "rewards/custom_reward_simplified_v7_dblog": 0.896875, |
| "step": 1990 |
| }, |
| { |
| "completion_length": 629.60625, |
| "epoch": 0.015928512834399215, |
| "grad_norm": 0.2776155471801758, |
| "kl": 0.015184593386948109, |
| "learning_rate": 4.1128047146765936e-07, |
| "loss": 0.0006, |
| "reward": 0.921875, |
| "reward_std": 0.23378355875611306, |
| "rewards/custom_reward_simplified_v7_dblog": 0.921875, |
| "step": 2000 |
| }, |
| { |
| "completion_length": 710.65, |
| "epoch": 0.016008155398571214, |
| "grad_norm": 0.13598495721817017, |
| "kl": 0.01561300114262849, |
| "learning_rate": 3.915213854677863e-07, |
| "loss": 0.0006, |
| "reward": 0.859375, |
| "reward_std": 0.22324086129665374, |
| "rewards/custom_reward_simplified_v7_dblog": 0.859375, |
| "step": 2010 |
| }, |
| { |
| "completion_length": 600.3625, |
| "epoch": 0.01608779796274321, |
| "grad_norm": 0.33102965354919434, |
| "kl": 0.01562973433174193, |
| "learning_rate": 3.722083189075007e-07, |
| "loss": 0.0006, |
| "reward": 1.0125, |
| "reward_std": 0.37898894101381303, |
| "rewards/custom_reward_simplified_v7_dblog": 1.0125, |
| "step": 2020 |
| }, |
| { |
| "completion_length": 633.40625, |
| "epoch": 0.016167440526915204, |
| "grad_norm": 0.009714637883007526, |
| "kl": 0.01524353977292776, |
| "learning_rate": 3.5334535719846767e-07, |
| "loss": 0.0006, |
| "reward": 0.775, |
| "reward_std": 0.1905590772628784, |
| "rewards/custom_reward_simplified_v7_dblog": 0.775, |
| "step": 2030 |
| }, |
| { |
| "completion_length": 674.3625, |
| "epoch": 0.0162470830910872, |
| "grad_norm": 0.2587895095348358, |
| "kl": 0.015684280125424267, |
| "learning_rate": 3.3493649053890325e-07, |
| "loss": 0.0006, |
| "reward": 0.978125, |
| "reward_std": 0.33772673830389977, |
| "rewards/custom_reward_simplified_v7_dblog": 0.978125, |
| "step": 2040 |
| }, |
| { |
| "completion_length": 623.1375, |
| "epoch": 0.016326725655259198, |
| "grad_norm": 0.24910244345664978, |
| "kl": 0.014677197439596057, |
| "learning_rate": 3.1698561306951065e-07, |
| "loss": 0.0006, |
| "reward": 0.925, |
| "reward_std": 0.3512172996997833, |
| "rewards/custom_reward_simplified_v7_dblog": 0.925, |
| "step": 2050 |
| }, |
| { |
| "completion_length": 678.0375, |
| "epoch": 0.016406368219431193, |
| "grad_norm": 0.20536966621875763, |
| "kl": 0.017746813944540918, |
| "learning_rate": 2.9949652204972257e-07, |
| "loss": 0.0007, |
| "reward": 0.828125, |
| "reward_std": 0.34475562572479246, |
| "rewards/custom_reward_simplified_v7_dblog": 0.828125, |
| "step": 2060 |
| }, |
| { |
| "completion_length": 634.36875, |
| "epoch": 0.016486010783603188, |
| "grad_norm": 0.26798176765441895, |
| "kl": 0.017110086302272974, |
| "learning_rate": 2.8247291705444575e-07, |
| "loss": 0.0007, |
| "reward": 0.89375, |
| "reward_std": 0.24814453721046448, |
| "rewards/custom_reward_simplified_v7_dblog": 0.89375, |
| "step": 2070 |
| }, |
| { |
| "completion_length": 710.6875, |
| "epoch": 0.016565653347775183, |
| "grad_norm": 0.20649504661560059, |
| "kl": 0.018557686172425748, |
| "learning_rate": 2.6591839919146963e-07, |
| "loss": 0.0007, |
| "reward": 0.828125, |
| "reward_std": 0.34967463091015816, |
| "rewards/custom_reward_simplified_v7_dblog": 0.828125, |
| "step": 2080 |
| }, |
| { |
| "completion_length": 642.375, |
| "epoch": 0.016645295911947182, |
| "grad_norm": 0.016043314710259438, |
| "kl": 0.018814650364220142, |
| "learning_rate": 2.4983647033969714e-07, |
| "loss": 0.0008, |
| "reward": 0.859375, |
| "reward_std": 0.3110216066241264, |
| "rewards/custom_reward_simplified_v7_dblog": 0.859375, |
| "step": 2090 |
| }, |
| { |
| "completion_length": 686.65625, |
| "epoch": 0.016724938476119177, |
| "grad_norm": 0.26343393325805664, |
| "kl": 0.019906887435354292, |
| "learning_rate": 2.3423053240837518e-07, |
| "loss": 0.0008, |
| "reward": 0.715625, |
| "reward_std": 0.17099330350756645, |
| "rewards/custom_reward_simplified_v7_dblog": 0.715625, |
| "step": 2100 |
| }, |
| { |
| "completion_length": 656.8, |
| "epoch": 0.016804581040291172, |
| "grad_norm": 0.01307599525898695, |
| "kl": 0.020065448177047075, |
| "learning_rate": 2.1910388661746495e-07, |
| "loss": 0.0008, |
| "reward": 0.8, |
| "reward_std": 0.20212240219116212, |
| "rewards/custom_reward_simplified_v7_dblog": 0.8, |
| "step": 2110 |
| }, |
| { |
| "completion_length": 714.25625, |
| "epoch": 0.01688422360446317, |
| "grad_norm": 0.2202935814857483, |
| "kl": 0.02329984272364527, |
| "learning_rate": 2.044597327993153e-07, |
| "loss": 0.0009, |
| "reward": 0.7875, |
| "reward_std": 0.307485481351614, |
| "rewards/custom_reward_simplified_v7_dblog": 0.7875, |
| "step": 2120 |
| }, |
| { |
| "completion_length": 685.39375, |
| "epoch": 0.016963866168635166, |
| "grad_norm": 0.30204537510871887, |
| "kl": 0.018967814440838993, |
| "learning_rate": 1.9030116872178317e-07, |
| "loss": 0.0008, |
| "reward": 0.803125, |
| "reward_std": 0.3279333204030991, |
| "rewards/custom_reward_simplified_v7_dblog": 0.803125, |
| "step": 2130 |
| }, |
| { |
| "completion_length": 674.49375, |
| "epoch": 0.01704350873280716, |
| "grad_norm": 0.012012571096420288, |
| "kl": 0.02170075795147568, |
| "learning_rate": 1.7663118943294367e-07, |
| "loss": 0.0009, |
| "reward": 0.703125, |
| "reward_std": 0.2257047951221466, |
| "rewards/custom_reward_simplified_v7_dblog": 0.703125, |
| "step": 2140 |
| }, |
| { |
| "completion_length": 694.63125, |
| "epoch": 0.017123151296979156, |
| "grad_norm": 0.01635037176311016, |
| "kl": 0.02094450539443642, |
| "learning_rate": 1.6345268662752904e-07, |
| "loss": 0.0008, |
| "reward": 0.7125, |
| "reward_std": 0.2917635254561901, |
| "rewards/custom_reward_simplified_v7_dblog": 0.7125, |
| "step": 2150 |
| }, |
| { |
| "completion_length": 702.025, |
| "epoch": 0.017202793861151155, |
| "grad_norm": 0.008707295171916485, |
| "kl": 0.01914967515040189, |
| "learning_rate": 1.507684480352292e-07, |
| "loss": 0.0008, |
| "reward": 0.821875, |
| "reward_std": 0.2691307656466961, |
| "rewards/custom_reward_simplified_v7_dblog": 0.821875, |
| "step": 2160 |
| }, |
| { |
| "completion_length": 704.90625, |
| "epoch": 0.01728243642532315, |
| "grad_norm": 0.1347748190164566, |
| "kl": 0.017809830722399056, |
| "learning_rate": 1.3858115683098832e-07, |
| "loss": 0.0007, |
| "reward": 0.9, |
| "reward_std": 0.30937733352184293, |
| "rewards/custom_reward_simplified_v7_dblog": 0.9, |
| "step": 2170 |
| }, |
| { |
| "completion_length": 650.13125, |
| "epoch": 0.017362078989495145, |
| "grad_norm": 0.013826651498675346, |
| "kl": 0.017964964429847897, |
| "learning_rate": 1.2689339106741529e-07, |
| "loss": 0.0007, |
| "reward": 0.821875, |
| "reward_std": 0.2382724992930889, |
| "rewards/custom_reward_simplified_v7_dblog": 0.821875, |
| "step": 2180 |
| }, |
| { |
| "completion_length": 574.075, |
| "epoch": 0.01744172155366714, |
| "grad_norm": 0.21891085803508759, |
| "kl": 0.013470867811702193, |
| "learning_rate": 1.1570762312943295e-07, |
| "loss": 0.0005, |
| "reward": 0.9875, |
| "reward_std": 0.2131643146276474, |
| "rewards/custom_reward_simplified_v7_dblog": 0.9875, |
| "step": 2190 |
| }, |
| { |
| "completion_length": 645.95, |
| "epoch": 0.01752136411783914, |
| "grad_norm": 0.28153711557388306, |
| "kl": 0.01899058516137302, |
| "learning_rate": 1.0502621921127776e-07, |
| "loss": 0.0008, |
| "reward": 0.834375, |
| "reward_std": 0.29732906967401507, |
| "rewards/custom_reward_simplified_v7_dblog": 0.834375, |
| "step": 2200 |
| }, |
| { |
| "completion_length": 618.19375, |
| "epoch": 0.017601006682011134, |
| "grad_norm": 0.25354552268981934, |
| "kl": 0.016854454204440115, |
| "learning_rate": 9.485143881596715e-08, |
| "loss": 0.0007, |
| "reward": 0.85625, |
| "reward_std": 0.25810291022062304, |
| "rewards/custom_reward_simplified_v7_dblog": 0.85625, |
| "step": 2210 |
| }, |
| { |
| "completion_length": 638.425, |
| "epoch": 0.01768064924618313, |
| "grad_norm": 0.2272520810365677, |
| "kl": 0.018312370544299482, |
| "learning_rate": 8.518543427732951e-08, |
| "loss": 0.0007, |
| "reward": 0.753125, |
| "reward_std": 0.2212974861264229, |
| "rewards/custom_reward_simplified_v7_dblog": 0.753125, |
| "step": 2220 |
| }, |
| { |
| "completion_length": 695.54375, |
| "epoch": 0.017760291810355128, |
| "grad_norm": 0.27871131896972656, |
| "kl": 0.02111934470012784, |
| "learning_rate": 7.603025030471001e-08, |
| "loss": 0.0008, |
| "reward": 0.75, |
| "reward_std": 0.2767858363687992, |
| "rewards/custom_reward_simplified_v7_dblog": 0.75, |
| "step": 2230 |
| }, |
| { |
| "completion_length": 632.99375, |
| "epoch": 0.017839934374527123, |
| "grad_norm": 0.008834543637931347, |
| "kl": 0.016428270121105017, |
| "learning_rate": 6.738782355044048e-08, |
| "loss": 0.0007, |
| "reward": 0.80625, |
| "reward_std": 0.21589626967906952, |
| "rewards/custom_reward_simplified_v7_dblog": 0.80625, |
| "step": 2240 |
| }, |
| { |
| "completion_length": 634.0625, |
| "epoch": 0.017919576938699118, |
| "grad_norm": 0.286683052778244, |
| "kl": 0.016679517249576746, |
| "learning_rate": 5.92599822001666e-08, |
| "loss": 0.0007, |
| "reward": 0.853125, |
| "reward_std": 0.2905955038964748, |
| "rewards/custom_reward_simplified_v7_dblog": 0.853125, |
| "step": 2250 |
| }, |
| { |
| "completion_length": 610.85, |
| "epoch": 0.017999219502871113, |
| "grad_norm": 0.28028422594070435, |
| "kl": 0.017966749798506498, |
| "learning_rate": 5.164844558612131e-08, |
| "loss": 0.0007, |
| "reward": 0.971875, |
| "reward_std": 0.3067967638373375, |
| "rewards/custom_reward_simplified_v7_dblog": 0.971875, |
| "step": 2260 |
| }, |
| { |
| "completion_length": 566.9625, |
| "epoch": 0.018078862067043112, |
| "grad_norm": 0.3413483202457428, |
| "kl": 0.01525729293935001, |
| "learning_rate": 4.455482382342336e-08, |
| "loss": 0.0006, |
| "reward": 0.959375, |
| "reward_std": 0.3084723956882954, |
| "rewards/custom_reward_simplified_v7_dblog": 0.959375, |
| "step": 2270 |
| }, |
| { |
| "completion_length": 662.05625, |
| "epoch": 0.018158504631215107, |
| "grad_norm": 0.153013676404953, |
| "kl": 0.017893880722112954, |
| "learning_rate": 3.798061746947995e-08, |
| "loss": 0.0007, |
| "reward": 0.753125, |
| "reward_std": 0.2308400221168995, |
| "rewards/custom_reward_simplified_v7_dblog": 0.753125, |
| "step": 2280 |
| }, |
| { |
| "completion_length": 615.55, |
| "epoch": 0.018238147195387102, |
| "grad_norm": 0.2853679060935974, |
| "kl": 0.0166370629100129, |
| "learning_rate": 3.1927217206564884e-08, |
| "loss": 0.0007, |
| "reward": 0.74375, |
| "reward_std": 0.25018117427825926, |
| "rewards/custom_reward_simplified_v7_dblog": 0.74375, |
| "step": 2290 |
| }, |
| { |
| "completion_length": 710.45, |
| "epoch": 0.018317789759559097, |
| "grad_norm": 0.011245607398450375, |
| "kl": 0.01835272475145757, |
| "learning_rate": 2.6395903547638825e-08, |
| "loss": 0.0007, |
| "reward": 0.78125, |
| "reward_std": 0.2881218962371349, |
| "rewards/custom_reward_simplified_v7_dblog": 0.78125, |
| "step": 2300 |
| }, |
| { |
| "completion_length": 565.125, |
| "epoch": 0.018397432323731096, |
| "grad_norm": 0.25337040424346924, |
| "kl": 0.01471406095661223, |
| "learning_rate": 2.1387846565474047e-08, |
| "loss": 0.0006, |
| "reward": 1.078125, |
| "reward_std": 0.4393742740154266, |
| "rewards/custom_reward_simplified_v7_dblog": 1.078125, |
| "step": 2310 |
| }, |
| { |
| "completion_length": 692.70625, |
| "epoch": 0.01847707488790309, |
| "grad_norm": 0.20416221022605896, |
| "kl": 0.02145941834896803, |
| "learning_rate": 1.6904105645142443e-08, |
| "loss": 0.0009, |
| "reward": 0.5625, |
| "reward_std": 0.1467035911977291, |
| "rewards/custom_reward_simplified_v7_dblog": 0.5625, |
| "step": 2320 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 2400, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 25, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|