smirki's picture
Training in progress, step 2325, checkpoint
74bf4c4 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.01851689616998909,
"eval_steps": 500,
"global_step": 2325,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 714.1125,
"epoch": 7.964256417199609e-05,
"grad_norm": 0.11108597368001938,
"kl": 0.0005961419927189126,
"learning_rate": 2.0833333333333333e-07,
"loss": 0.0,
"reward": 0.540625,
"reward_std": 0.29713641852140427,
"rewards/custom_reward_simplified_v7_dblog": 0.540625,
"step": 10
},
{
"completion_length": 800.6625,
"epoch": 0.00015928512834399218,
"grad_norm": 0.1964382529258728,
"kl": 0.0007280149788130075,
"learning_rate": 4.1666666666666667e-07,
"loss": 0.0,
"reward": 0.496875,
"reward_std": 0.25719649270176886,
"rewards/custom_reward_simplified_v7_dblog": 0.496875,
"step": 20
},
{
"completion_length": 750.46875,
"epoch": 0.00023892769251598824,
"grad_norm": 0.15792745351791382,
"kl": 0.0007828957575839012,
"learning_rate": 6.25e-07,
"loss": 0.0,
"reward": 0.684375,
"reward_std": 0.3755971297621727,
"rewards/custom_reward_simplified_v7_dblog": 0.684375,
"step": 30
},
{
"completion_length": 813.94375,
"epoch": 0.00031857025668798435,
"grad_norm": 0.12503573298454285,
"kl": 0.0007155703555326909,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0,
"reward": 0.565625,
"reward_std": 0.2761854581534863,
"rewards/custom_reward_simplified_v7_dblog": 0.565625,
"step": 40
},
{
"completion_length": 747.675,
"epoch": 0.0003982128208599804,
"grad_norm": 0.10329681634902954,
"kl": 0.0007686431898036971,
"learning_rate": 1.0416666666666667e-06,
"loss": 0.0,
"reward": 0.621875,
"reward_std": 0.30715219378471376,
"rewards/custom_reward_simplified_v7_dblog": 0.621875,
"step": 50
},
{
"completion_length": 821.60625,
"epoch": 0.0004778553850319765,
"grad_norm": 0.1834840029478073,
"kl": 0.0007538022648077459,
"learning_rate": 1.25e-06,
"loss": 0.0,
"reward": 0.578125,
"reward_std": 0.39505376294255257,
"rewards/custom_reward_simplified_v7_dblog": 0.578125,
"step": 60
},
{
"completion_length": 776.75,
"epoch": 0.0005574979492039726,
"grad_norm": 0.11483483016490936,
"kl": 0.0007510531373554841,
"learning_rate": 1.4583333333333335e-06,
"loss": 0.0,
"reward": 0.584375,
"reward_std": 0.32483330443501474,
"rewards/custom_reward_simplified_v7_dblog": 0.584375,
"step": 70
},
{
"completion_length": 804.675,
"epoch": 0.0006371405133759687,
"grad_norm": 0.17995329201221466,
"kl": 0.0007302156562218442,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0,
"reward": 0.703125,
"reward_std": 0.32263160347938535,
"rewards/custom_reward_simplified_v7_dblog": 0.703125,
"step": 80
},
{
"completion_length": 793.0875,
"epoch": 0.0007167830775479647,
"grad_norm": 0.16513389348983765,
"kl": 0.0007239854254294187,
"learning_rate": 1.8750000000000003e-06,
"loss": 0.0,
"reward": 0.709375,
"reward_std": 0.3102527566254139,
"rewards/custom_reward_simplified_v7_dblog": 0.709375,
"step": 90
},
{
"completion_length": 812.0,
"epoch": 0.0007964256417199608,
"grad_norm": 0.1802467256784439,
"kl": 0.0007639184041181579,
"learning_rate": 2.0833333333333334e-06,
"loss": 0.0,
"reward": 0.528125,
"reward_std": 0.21242836564779283,
"rewards/custom_reward_simplified_v7_dblog": 0.528125,
"step": 100
},
{
"completion_length": 784.64375,
"epoch": 0.0008760682058919569,
"grad_norm": 0.17609436810016632,
"kl": 0.0007660316972760483,
"learning_rate": 2.2916666666666666e-06,
"loss": 0.0,
"reward": 0.565625,
"reward_std": 0.3309394560754299,
"rewards/custom_reward_simplified_v7_dblog": 0.565625,
"step": 110
},
{
"completion_length": 717.24375,
"epoch": 0.000955710770063953,
"grad_norm": 0.14550578594207764,
"kl": 0.0007782038446748629,
"learning_rate": 2.5e-06,
"loss": 0.0,
"reward": 0.728125,
"reward_std": 0.2573545627295971,
"rewards/custom_reward_simplified_v7_dblog": 0.728125,
"step": 120
},
{
"completion_length": 872.6375,
"epoch": 0.001035353334235949,
"grad_norm": 0.11807532608509064,
"kl": 0.0007370044564595446,
"learning_rate": 2.7083333333333334e-06,
"loss": 0.0,
"reward": 0.45,
"reward_std": 0.24368184804916382,
"rewards/custom_reward_simplified_v7_dblog": 0.45,
"step": 130
},
{
"completion_length": 780.325,
"epoch": 0.0011149958984079452,
"grad_norm": 0.21067936718463898,
"kl": 0.0007969280297402293,
"learning_rate": 2.916666666666667e-06,
"loss": 0.0,
"reward": 0.671875,
"reward_std": 0.3312204420566559,
"rewards/custom_reward_simplified_v7_dblog": 0.671875,
"step": 140
},
{
"completion_length": 796.15625,
"epoch": 0.0011946384625799412,
"grad_norm": 0.11178277432918549,
"kl": 0.0007584215141832829,
"learning_rate": 3.125e-06,
"loss": 0.0,
"reward": 0.675,
"reward_std": 0.2411833107471466,
"rewards/custom_reward_simplified_v7_dblog": 0.675,
"step": 150
},
{
"completion_length": 735.4375,
"epoch": 0.0012742810267519374,
"grad_norm": 0.12408847361803055,
"kl": 0.0008089728711638599,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0,
"reward": 0.5875,
"reward_std": 0.2907834567129612,
"rewards/custom_reward_simplified_v7_dblog": 0.5875,
"step": 160
},
{
"completion_length": 630.76875,
"epoch": 0.0013539235909239334,
"grad_norm": 0.14481835067272186,
"kl": 0.0008351787488209084,
"learning_rate": 3.5416666666666673e-06,
"loss": 0.0,
"reward": 0.828125,
"reward_std": 0.3232325129210949,
"rewards/custom_reward_simplified_v7_dblog": 0.828125,
"step": 170
},
{
"completion_length": 704.2,
"epoch": 0.0014335661550959294,
"grad_norm": 0.22581899166107178,
"kl": 0.0008706353197339922,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0,
"reward": 0.621875,
"reward_std": 0.2438264600932598,
"rewards/custom_reward_simplified_v7_dblog": 0.621875,
"step": 180
},
{
"completion_length": 738.2625,
"epoch": 0.0015132087192679256,
"grad_norm": 0.20901009440422058,
"kl": 0.000852665287675336,
"learning_rate": 3.958333333333333e-06,
"loss": 0.0,
"reward": 0.659375,
"reward_std": 0.2661551833152771,
"rewards/custom_reward_simplified_v7_dblog": 0.659375,
"step": 190
},
{
"completion_length": 773.31875,
"epoch": 0.0015928512834399217,
"grad_norm": 0.14023999869823456,
"kl": 0.0008427878346992657,
"learning_rate": 4.166666666666667e-06,
"loss": 0.0,
"reward": 0.575,
"reward_std": 0.263551290333271,
"rewards/custom_reward_simplified_v7_dblog": 0.575,
"step": 200
},
{
"completion_length": 760.4,
"epoch": 0.0016724938476119177,
"grad_norm": 0.15415024757385254,
"kl": 0.0009272771596442908,
"learning_rate": 4.3750000000000005e-06,
"loss": 0.0,
"reward": 0.578125,
"reward_std": 0.3055797599256039,
"rewards/custom_reward_simplified_v7_dblog": 0.578125,
"step": 210
},
{
"completion_length": 824.94375,
"epoch": 0.0017521364117839139,
"grad_norm": 0.18523605167865753,
"kl": 0.0008898543601389974,
"learning_rate": 4.583333333333333e-06,
"loss": 0.0,
"reward": 0.58125,
"reward_std": 0.2951655209064484,
"rewards/custom_reward_simplified_v7_dblog": 0.58125,
"step": 220
},
{
"completion_length": 748.5375,
"epoch": 0.0018317789759559099,
"grad_norm": 0.11306847631931305,
"kl": 0.0009787698683794588,
"learning_rate": 4.791666666666668e-06,
"loss": 0.0,
"reward": 0.590625,
"reward_std": 0.2887454777956009,
"rewards/custom_reward_simplified_v7_dblog": 0.590625,
"step": 230
},
{
"completion_length": 753.39375,
"epoch": 0.001911421540127906,
"grad_norm": 0.0014718669699504972,
"kl": 0.0010118414385942743,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.55625,
"reward_std": 0.1977315753698349,
"rewards/custom_reward_simplified_v7_dblog": 0.55625,
"step": 240
},
{
"completion_length": 812.71875,
"epoch": 0.001991064104299902,
"grad_norm": 0.11223085969686508,
"kl": 0.0010390775743871928,
"learning_rate": 4.999735579817769e-06,
"loss": 0.0,
"reward": 0.6875,
"reward_std": 0.24264758601784706,
"rewards/custom_reward_simplified_v7_dblog": 0.6875,
"step": 250
},
{
"completion_length": 731.66875,
"epoch": 0.002070706668471898,
"grad_norm": 0.1944543570280075,
"kl": 0.001084678602637723,
"learning_rate": 4.998942375205502e-06,
"loss": 0.0,
"reward": 0.796875,
"reward_std": 0.31279500126838683,
"rewards/custom_reward_simplified_v7_dblog": 0.796875,
"step": 260
},
{
"completion_length": 770.69375,
"epoch": 0.0021503492326438944,
"grad_norm": 0.10904921591281891,
"kl": 0.0012701354396995157,
"learning_rate": 4.997620553954645e-06,
"loss": 0.0001,
"reward": 0.653125,
"reward_std": 0.1583670809864998,
"rewards/custom_reward_simplified_v7_dblog": 0.653125,
"step": 270
},
{
"completion_length": 646.63125,
"epoch": 0.0022299917968158904,
"grad_norm": 0.11776451766490936,
"kl": 0.0026803009008290247,
"learning_rate": 4.995770395678171e-06,
"loss": 0.0001,
"reward": 0.78125,
"reward_std": 0.37105962783098223,
"rewards/custom_reward_simplified_v7_dblog": 0.78125,
"step": 280
},
{
"completion_length": 850.5125,
"epoch": 0.0023096343609878864,
"grad_norm": 0.17029190063476562,
"kl": 0.0011812534503405914,
"learning_rate": 4.993392291751431e-06,
"loss": 0.0,
"reward": 0.503125,
"reward_std": 0.2320079453289509,
"rewards/custom_reward_simplified_v7_dblog": 0.503125,
"step": 290
},
{
"completion_length": 774.4125,
"epoch": 0.0023892769251598824,
"grad_norm": 0.17417992651462555,
"kl": 0.001456298804259859,
"learning_rate": 4.990486745229364e-06,
"loss": 0.0001,
"reward": 0.621875,
"reward_std": 0.24568462520837783,
"rewards/custom_reward_simplified_v7_dblog": 0.621875,
"step": 300
},
{
"completion_length": 733.675,
"epoch": 0.0024689194893318784,
"grad_norm": 0.13222694396972656,
"kl": 0.001577114372048527,
"learning_rate": 4.9870543707400835e-06,
"loss": 0.0001,
"reward": 0.653125,
"reward_std": 0.27914761677384375,
"rewards/custom_reward_simplified_v7_dblog": 0.653125,
"step": 310
},
{
"completion_length": 711.91875,
"epoch": 0.002548562053503875,
"grad_norm": 0.19241130352020264,
"kl": 0.0017230566183570773,
"learning_rate": 4.983095894354858e-06,
"loss": 0.0001,
"reward": 0.68125,
"reward_std": 0.3178554192185402,
"rewards/custom_reward_simplified_v7_dblog": 0.68125,
"step": 320
},
{
"completion_length": 780.34375,
"epoch": 0.002628204617675871,
"grad_norm": 0.1997414082288742,
"kl": 0.002029248425969854,
"learning_rate": 4.978612153434527e-06,
"loss": 0.0001,
"reward": 0.696875,
"reward_std": 0.32896072417497635,
"rewards/custom_reward_simplified_v7_dblog": 0.696875,
"step": 330
},
{
"completion_length": 695.63125,
"epoch": 0.002707847181847867,
"grad_norm": 0.18966233730316162,
"kl": 0.002277573832543567,
"learning_rate": 4.973604096452361e-06,
"loss": 0.0001,
"reward": 0.684375,
"reward_std": 0.2995404839515686,
"rewards/custom_reward_simplified_v7_dblog": 0.684375,
"step": 340
},
{
"completion_length": 719.425,
"epoch": 0.002787489746019863,
"grad_norm": 0.17769980430603027,
"kl": 0.002305405435618013,
"learning_rate": 4.968072782793436e-06,
"loss": 0.0001,
"reward": 0.74375,
"reward_std": 0.3807508498430252,
"rewards/custom_reward_simplified_v7_dblog": 0.74375,
"step": 350
},
{
"completion_length": 732.4375,
"epoch": 0.002867132310191859,
"grad_norm": 0.21898534893989563,
"kl": 0.002607938600704074,
"learning_rate": 4.962019382530521e-06,
"loss": 0.0001,
"reward": 0.596875,
"reward_std": 0.303117785602808,
"rewards/custom_reward_simplified_v7_dblog": 0.596875,
"step": 360
},
{
"completion_length": 703.21875,
"epoch": 0.002946774874363855,
"grad_norm": 0.20463985204696655,
"kl": 0.0030091375578194858,
"learning_rate": 4.955445176176577e-06,
"loss": 0.0001,
"reward": 0.746875,
"reward_std": 0.28880608528852464,
"rewards/custom_reward_simplified_v7_dblog": 0.746875,
"step": 370
},
{
"completion_length": 646.95625,
"epoch": 0.0030264174385358513,
"grad_norm": 0.17787523567676544,
"kl": 0.003602780296932906,
"learning_rate": 4.948351554413879e-06,
"loss": 0.0001,
"reward": 0.753125,
"reward_std": 0.31493050456047056,
"rewards/custom_reward_simplified_v7_dblog": 0.753125,
"step": 380
},
{
"completion_length": 656.94375,
"epoch": 0.0031060600027078473,
"grad_norm": 0.18550129234790802,
"kl": 0.003282526368275285,
"learning_rate": 4.9407400177998335e-06,
"loss": 0.0001,
"reward": 0.828125,
"reward_std": 0.33323406875133516,
"rewards/custom_reward_simplified_v7_dblog": 0.828125,
"step": 390
},
{
"completion_length": 740.3125,
"epoch": 0.0031857025668798433,
"grad_norm": 0.19987954199314117,
"kl": 0.003102585405576974,
"learning_rate": 4.93261217644956e-06,
"loss": 0.0001,
"reward": 0.590625,
"reward_std": 0.26303397938609124,
"rewards/custom_reward_simplified_v7_dblog": 0.590625,
"step": 400
},
{
"completion_length": 641.26875,
"epoch": 0.0032653451310518393,
"grad_norm": 0.21161562204360962,
"kl": 0.003351045388262719,
"learning_rate": 4.9239697496952904e-06,
"loss": 0.0001,
"reward": 0.909375,
"reward_std": 0.3579762116074562,
"rewards/custom_reward_simplified_v7_dblog": 0.909375,
"step": 410
},
{
"completion_length": 692.06875,
"epoch": 0.0033449876952238353,
"grad_norm": 0.17584940791130066,
"kl": 0.003339459316339344,
"learning_rate": 4.914814565722671e-06,
"loss": 0.0001,
"reward": 0.765625,
"reward_std": 0.3109076008200645,
"rewards/custom_reward_simplified_v7_dblog": 0.765625,
"step": 420
},
{
"completion_length": 638.51875,
"epoch": 0.0034246302593958313,
"grad_norm": 0.17778904736042023,
"kl": 0.0034626491484232246,
"learning_rate": 4.905148561184033e-06,
"loss": 0.0001,
"reward": 0.671875,
"reward_std": 0.2665500298142433,
"rewards/custom_reward_simplified_v7_dblog": 0.671875,
"step": 430
},
{
"completion_length": 597.475,
"epoch": 0.0035042728235678278,
"grad_norm": 0.127123162150383,
"kl": 0.0039646215736866,
"learning_rate": 4.894973780788722e-06,
"loss": 0.0002,
"reward": 0.88125,
"reward_std": 0.28942874893546106,
"rewards/custom_reward_simplified_v7_dblog": 0.88125,
"step": 440
},
{
"completion_length": 651.6625,
"epoch": 0.0035839153877398238,
"grad_norm": 0.21087874472141266,
"kl": 0.004210945626255125,
"learning_rate": 4.884292376870567e-06,
"loss": 0.0002,
"reward": 0.753125,
"reward_std": 0.29777742698788645,
"rewards/custom_reward_simplified_v7_dblog": 0.753125,
"step": 450
},
{
"completion_length": 727.13125,
"epoch": 0.0036635579519118198,
"grad_norm": 0.18630079925060272,
"kl": 0.003935616160742938,
"learning_rate": 4.873106608932585e-06,
"loss": 0.0002,
"reward": 0.678125,
"reward_std": 0.31932896226644514,
"rewards/custom_reward_simplified_v7_dblog": 0.678125,
"step": 460
},
{
"completion_length": 716.74375,
"epoch": 0.003743200516083816,
"grad_norm": 0.1637570858001709,
"kl": 0.004373999196104705,
"learning_rate": 4.861418843169012e-06,
"loss": 0.0002,
"reward": 0.646875,
"reward_std": 0.26624983847141265,
"rewards/custom_reward_simplified_v7_dblog": 0.646875,
"step": 470
},
{
"completion_length": 581.90625,
"epoch": 0.003822843080255812,
"grad_norm": 0.0051241409964859486,
"kl": 0.004909415659494698,
"learning_rate": 4.849231551964771e-06,
"loss": 0.0002,
"reward": 0.75625,
"reward_std": 0.19474873542785645,
"rewards/custom_reward_simplified_v7_dblog": 0.75625,
"step": 480
},
{
"completion_length": 680.94375,
"epoch": 0.003902485644427808,
"grad_norm": 0.15670013427734375,
"kl": 0.004694941581692547,
"learning_rate": 4.836547313372472e-06,
"loss": 0.0002,
"reward": 0.73125,
"reward_std": 0.2675834000110626,
"rewards/custom_reward_simplified_v7_dblog": 0.73125,
"step": 490
},
{
"completion_length": 699.1,
"epoch": 0.003982128208599804,
"grad_norm": 0.1365301012992859,
"kl": 0.00405421577161178,
"learning_rate": 4.823368810567056e-06,
"loss": 0.0002,
"reward": 0.603125,
"reward_std": 0.25718758851289747,
"rewards/custom_reward_simplified_v7_dblog": 0.603125,
"step": 500
},
{
"completion_length": 646.7,
"epoch": 0.0040617707727718,
"grad_norm": 0.14925876259803772,
"kl": 0.003934591950383037,
"learning_rate": 4.809698831278217e-06,
"loss": 0.0002,
"reward": 0.734375,
"reward_std": 0.2696119427680969,
"rewards/custom_reward_simplified_v7_dblog": 0.734375,
"step": 510
},
{
"completion_length": 726.08125,
"epoch": 0.004141413336943796,
"grad_norm": 0.2107785940170288,
"kl": 0.004233359964564443,
"learning_rate": 4.7955402672006855e-06,
"loss": 0.0002,
"reward": 0.759375,
"reward_std": 0.2953102938830853,
"rewards/custom_reward_simplified_v7_dblog": 0.759375,
"step": 520
},
{
"completion_length": 633.525,
"epoch": 0.004221055901115793,
"grad_norm": 0.2159271538257599,
"kl": 0.004929024970624596,
"learning_rate": 4.780896113382536e-06,
"loss": 0.0002,
"reward": 0.75625,
"reward_std": 0.2647860750555992,
"rewards/custom_reward_simplified_v7_dblog": 0.75625,
"step": 530
},
{
"completion_length": 586.9125,
"epoch": 0.004300698465287789,
"grad_norm": 0.2394983470439911,
"kl": 0.004724201350472868,
"learning_rate": 4.765769467591626e-06,
"loss": 0.0002,
"reward": 0.975,
"reward_std": 0.36022927314043046,
"rewards/custom_reward_simplified_v7_dblog": 0.975,
"step": 540
},
{
"completion_length": 651.88125,
"epoch": 0.004380341029459785,
"grad_norm": 0.1552504301071167,
"kl": 0.004269527771975845,
"learning_rate": 4.750163529660303e-06,
"loss": 0.0002,
"reward": 0.790625,
"reward_std": 0.2759058982133865,
"rewards/custom_reward_simplified_v7_dblog": 0.790625,
"step": 550
},
{
"completion_length": 655.9125,
"epoch": 0.004459983593631781,
"grad_norm": 0.13005749881267548,
"kl": 0.004541868972592056,
"learning_rate": 4.734081600808531e-06,
"loss": 0.0002,
"reward": 0.796875,
"reward_std": 0.2369130529463291,
"rewards/custom_reward_simplified_v7_dblog": 0.796875,
"step": 560
},
{
"completion_length": 630.3375,
"epoch": 0.004539626157803777,
"grad_norm": 0.14732114970684052,
"kl": 0.004577037692070007,
"learning_rate": 4.717527082945555e-06,
"loss": 0.0002,
"reward": 0.925,
"reward_std": 0.3310479797422886,
"rewards/custom_reward_simplified_v7_dblog": 0.925,
"step": 570
},
{
"completion_length": 693.2625,
"epoch": 0.004619268721975773,
"grad_norm": 0.11388376355171204,
"kl": 0.004154781624674797,
"learning_rate": 4.700503477950278e-06,
"loss": 0.0002,
"reward": 0.6875,
"reward_std": 0.29332098439335824,
"rewards/custom_reward_simplified_v7_dblog": 0.6875,
"step": 580
},
{
"completion_length": 662.7625,
"epoch": 0.004698911286147769,
"grad_norm": 0.15470421314239502,
"kl": 0.00541011628229171,
"learning_rate": 4.6830143869304904e-06,
"loss": 0.0002,
"reward": 0.809375,
"reward_std": 0.32753978818655016,
"rewards/custom_reward_simplified_v7_dblog": 0.809375,
"step": 590
},
{
"completion_length": 698.95625,
"epoch": 0.004778553850319765,
"grad_norm": 0.004228990990668535,
"kl": 0.004637495230417699,
"learning_rate": 4.665063509461098e-06,
"loss": 0.0002,
"reward": 0.75,
"reward_std": 0.23772156983613968,
"rewards/custom_reward_simplified_v7_dblog": 0.75,
"step": 600
},
{
"completion_length": 629.7625,
"epoch": 0.004858196414491761,
"grad_norm": 0.21860064566135406,
"kl": 0.0044788535917177795,
"learning_rate": 4.646654642801533e-06,
"loss": 0.0002,
"reward": 0.8125,
"reward_std": 0.27716630697250366,
"rewards/custom_reward_simplified_v7_dblog": 0.8125,
"step": 610
},
{
"completion_length": 727.06875,
"epoch": 0.004937838978663757,
"grad_norm": 0.1765265315771103,
"kl": 0.004957099666353315,
"learning_rate": 4.627791681092499e-06,
"loss": 0.0002,
"reward": 0.6,
"reward_std": 0.2689620770514011,
"rewards/custom_reward_simplified_v7_dblog": 0.6,
"step": 620
},
{
"completion_length": 718.825,
"epoch": 0.005017481542835753,
"grad_norm": 0.12771090865135193,
"kl": 0.005165508517529815,
"learning_rate": 4.608478614532215e-06,
"loss": 0.0002,
"reward": 0.728125,
"reward_std": 0.3053886480629444,
"rewards/custom_reward_simplified_v7_dblog": 0.728125,
"step": 630
},
{
"completion_length": 629.525,
"epoch": 0.00509712410700775,
"grad_norm": 0.17840693891048431,
"kl": 0.005059469246771186,
"learning_rate": 4.588719528532342e-06,
"loss": 0.0002,
"reward": 0.721875,
"reward_std": 0.298052953928709,
"rewards/custom_reward_simplified_v7_dblog": 0.721875,
"step": 640
},
{
"completion_length": 668.68125,
"epoch": 0.005176766671179746,
"grad_norm": 0.12746350467205048,
"kl": 0.004331990797072649,
"learning_rate": 4.568518602853776e-06,
"loss": 0.0002,
"reward": 0.746875,
"reward_std": 0.22913563549518584,
"rewards/custom_reward_simplified_v7_dblog": 0.746875,
"step": 650
},
{
"completion_length": 734.9875,
"epoch": 0.005256409235351742,
"grad_norm": 0.19717195630073547,
"kl": 0.00479215239174664,
"learning_rate": 4.54788011072248e-06,
"loss": 0.0002,
"reward": 0.784375,
"reward_std": 0.4230809181928635,
"rewards/custom_reward_simplified_v7_dblog": 0.784375,
"step": 660
},
{
"completion_length": 658.29375,
"epoch": 0.005336051799523738,
"grad_norm": 0.2698514759540558,
"kl": 0.004821322776842862,
"learning_rate": 4.526808417925531e-06,
"loss": 0.0002,
"reward": 0.81875,
"reward_std": 0.26030006259679794,
"rewards/custom_reward_simplified_v7_dblog": 0.81875,
"step": 670
},
{
"completion_length": 696.30625,
"epoch": 0.005415694363695734,
"grad_norm": 0.2144252061843872,
"kl": 0.005292760988231749,
"learning_rate": 4.50530798188761e-06,
"loss": 0.0002,
"reward": 0.609375,
"reward_std": 0.2595392823219299,
"rewards/custom_reward_simplified_v7_dblog": 0.609375,
"step": 680
},
{
"completion_length": 696.99375,
"epoch": 0.00549533692786773,
"grad_norm": 0.006262101698666811,
"kl": 0.005413674132432789,
"learning_rate": 4.4833833507280884e-06,
"loss": 0.0002,
"reward": 0.684375,
"reward_std": 0.24843912497162818,
"rewards/custom_reward_simplified_v7_dblog": 0.684375,
"step": 690
},
{
"completion_length": 675.50625,
"epoch": 0.005574979492039726,
"grad_norm": 0.16301825642585754,
"kl": 0.005892223375849426,
"learning_rate": 4.46103916229894e-06,
"loss": 0.0002,
"reward": 0.80625,
"reward_std": 0.34091843143105505,
"rewards/custom_reward_simplified_v7_dblog": 0.80625,
"step": 700
},
{
"completion_length": 725.675,
"epoch": 0.005654622056211722,
"grad_norm": 0.18473494052886963,
"kl": 0.005652935197576881,
"learning_rate": 4.438280143203665e-06,
"loss": 0.0002,
"reward": 0.66875,
"reward_std": 0.216452856361866,
"rewards/custom_reward_simplified_v7_dblog": 0.66875,
"step": 710
},
{
"completion_length": 764.26875,
"epoch": 0.005734264620383718,
"grad_norm": 0.17735017836093903,
"kl": 0.005824547982774675,
"learning_rate": 4.415111107797445e-06,
"loss": 0.0002,
"reward": 0.634375,
"reward_std": 0.25477964654564855,
"rewards/custom_reward_simplified_v7_dblog": 0.634375,
"step": 720
},
{
"completion_length": 607.86875,
"epoch": 0.005813907184555714,
"grad_norm": 0.20680995285511017,
"kl": 0.0055589195340871814,
"learning_rate": 4.391536957168733e-06,
"loss": 0.0002,
"reward": 0.8,
"reward_std": 0.32480863481760025,
"rewards/custom_reward_simplified_v7_dblog": 0.8,
"step": 730
},
{
"completion_length": 674.13125,
"epoch": 0.00589354974872771,
"grad_norm": 0.005594769027084112,
"kl": 0.005972519854549318,
"learning_rate": 4.367562678102491e-06,
"loss": 0.0002,
"reward": 0.665625,
"reward_std": 0.20820673778653145,
"rewards/custom_reward_simplified_v7_dblog": 0.665625,
"step": 740
},
{
"completion_length": 639.69375,
"epoch": 0.005973192312899706,
"grad_norm": 0.11012833565473557,
"kl": 0.005814655229914934,
"learning_rate": 4.34319334202531e-06,
"loss": 0.0002,
"reward": 0.796875,
"reward_std": 0.34761993661522866,
"rewards/custom_reward_simplified_v7_dblog": 0.796875,
"step": 750
},
{
"completion_length": 587.6,
"epoch": 0.006052834877071703,
"grad_norm": 0.2750849723815918,
"kl": 0.006217251974157989,
"learning_rate": 4.318434103932622e-06,
"loss": 0.0002,
"reward": 0.75625,
"reward_std": 0.23903784826397895,
"rewards/custom_reward_simplified_v7_dblog": 0.75625,
"step": 760
},
{
"completion_length": 691.45625,
"epoch": 0.006132477441243699,
"grad_norm": 0.12792551517486572,
"kl": 0.005762395297642798,
"learning_rate": 4.293290201298224e-06,
"loss": 0.0002,
"reward": 0.65,
"reward_std": 0.282283828407526,
"rewards/custom_reward_simplified_v7_dblog": 0.65,
"step": 770
},
{
"completion_length": 634.79375,
"epoch": 0.006212120005415695,
"grad_norm": 0.11762549728155136,
"kl": 0.005472023575566709,
"learning_rate": 4.267766952966369e-06,
"loss": 0.0002,
"reward": 0.878125,
"reward_std": 0.31506996527314185,
"rewards/custom_reward_simplified_v7_dblog": 0.878125,
"step": 780
},
{
"completion_length": 719.05625,
"epoch": 0.006291762569587691,
"grad_norm": 0.0052847606129944324,
"kl": 0.006504135020077228,
"learning_rate": 4.241869758026638e-06,
"loss": 0.0003,
"reward": 0.628125,
"reward_std": 0.2685270056128502,
"rewards/custom_reward_simplified_v7_dblog": 0.628125,
"step": 790
},
{
"completion_length": 699.19375,
"epoch": 0.006371405133759687,
"grad_norm": 0.2003583461046219,
"kl": 0.005931918846908957,
"learning_rate": 4.215604094671835e-06,
"loss": 0.0002,
"reward": 0.746875,
"reward_std": 0.25832219421863556,
"rewards/custom_reward_simplified_v7_dblog": 0.746875,
"step": 800
},
{
"completion_length": 652.925,
"epoch": 0.006451047697931683,
"grad_norm": 0.0062674470245838165,
"kl": 0.006221415114123374,
"learning_rate": 4.188975519039151e-06,
"loss": 0.0002,
"reward": 0.73125,
"reward_std": 0.3172403134405613,
"rewards/custom_reward_simplified_v7_dblog": 0.73125,
"step": 810
},
{
"completion_length": 668.63125,
"epoch": 0.006530690262103679,
"grad_norm": 0.13624051213264465,
"kl": 0.0063671735813841225,
"learning_rate": 4.161989664034844e-06,
"loss": 0.0003,
"reward": 0.684375,
"reward_std": 0.24903304055333136,
"rewards/custom_reward_simplified_v7_dblog": 0.684375,
"step": 820
},
{
"completion_length": 658.575,
"epoch": 0.006610332826275675,
"grad_norm": 0.2923766076564789,
"kl": 0.0068331335205584764,
"learning_rate": 4.134652238142674e-06,
"loss": 0.0003,
"reward": 0.73125,
"reward_std": 0.3243869088590145,
"rewards/custom_reward_simplified_v7_dblog": 0.73125,
"step": 830
},
{
"completion_length": 645.31875,
"epoch": 0.006689975390447671,
"grad_norm": 0.22414511442184448,
"kl": 0.006329123536124826,
"learning_rate": 4.106969024216348e-06,
"loss": 0.0003,
"reward": 0.728125,
"reward_std": 0.2578707054257393,
"rewards/custom_reward_simplified_v7_dblog": 0.728125,
"step": 840
},
{
"completion_length": 620.76875,
"epoch": 0.006769617954619667,
"grad_norm": 0.2500353455543518,
"kl": 0.006427089823409915,
"learning_rate": 4.078945878256244e-06,
"loss": 0.0003,
"reward": 0.85625,
"reward_std": 0.3704014003276825,
"rewards/custom_reward_simplified_v7_dblog": 0.85625,
"step": 850
},
{
"completion_length": 545.075,
"epoch": 0.006849260518791663,
"grad_norm": 0.18576188385486603,
"kl": 0.005737546656746417,
"learning_rate": 4.0505887281706505e-06,
"loss": 0.0002,
"reward": 0.9125,
"reward_std": 0.27787805944681165,
"rewards/custom_reward_simplified_v7_dblog": 0.9125,
"step": 860
},
{
"completion_length": 671.1375,
"epoch": 0.0069289030829636595,
"grad_norm": 0.27761420607566833,
"kl": 0.005926149617880583,
"learning_rate": 4.021903572521802e-06,
"loss": 0.0002,
"reward": 0.71875,
"reward_std": 0.1984293892979622,
"rewards/custom_reward_simplified_v7_dblog": 0.71875,
"step": 870
},
{
"completion_length": 591.325,
"epoch": 0.0070085456471356555,
"grad_norm": 0.12898898124694824,
"kl": 0.006013317289762199,
"learning_rate": 3.992896479256966e-06,
"loss": 0.0002,
"reward": 0.875,
"reward_std": 0.31373453289270403,
"rewards/custom_reward_simplified_v7_dblog": 0.875,
"step": 880
},
{
"completion_length": 709.64375,
"epoch": 0.0070881882113076515,
"grad_norm": 0.1858564019203186,
"kl": 0.006654553860425949,
"learning_rate": 3.963573584424852e-06,
"loss": 0.0003,
"reward": 0.875,
"reward_std": 0.40053595080971716,
"rewards/custom_reward_simplified_v7_dblog": 0.875,
"step": 890
},
{
"completion_length": 693.86875,
"epoch": 0.0071678307754796475,
"grad_norm": 0.23618744313716888,
"kl": 0.006588698271661997,
"learning_rate": 3.933941090877615e-06,
"loss": 0.0003,
"reward": 0.6875,
"reward_std": 0.22922600656747819,
"rewards/custom_reward_simplified_v7_dblog": 0.6875,
"step": 900
},
{
"completion_length": 655.1,
"epoch": 0.0072474733396516436,
"grad_norm": 0.18607589602470398,
"kl": 0.006554636568762362,
"learning_rate": 3.9040052669587325e-06,
"loss": 0.0003,
"reward": 0.79375,
"reward_std": 0.26788339093327523,
"rewards/custom_reward_simplified_v7_dblog": 0.79375,
"step": 910
},
{
"completion_length": 678.36875,
"epoch": 0.0073271159038236396,
"grad_norm": 0.15605397522449493,
"kl": 0.006827571708709001,
"learning_rate": 3.8737724451770155e-06,
"loss": 0.0003,
"reward": 0.74375,
"reward_std": 0.25242582634091376,
"rewards/custom_reward_simplified_v7_dblog": 0.74375,
"step": 920
},
{
"completion_length": 640.1875,
"epoch": 0.0074067584679956356,
"grad_norm": 0.22241215407848358,
"kl": 0.006700195767916739,
"learning_rate": 3.8432490208670605e-06,
"loss": 0.0003,
"reward": 0.753125,
"reward_std": 0.30004683434963225,
"rewards/custom_reward_simplified_v7_dblog": 0.753125,
"step": 930
},
{
"completion_length": 671.025,
"epoch": 0.007486401032167632,
"grad_norm": 0.2610742747783661,
"kl": 0.007203501905314625,
"learning_rate": 3.8124414508364005e-06,
"loss": 0.0003,
"reward": 0.696875,
"reward_std": 0.2809624969959259,
"rewards/custom_reward_simplified_v7_dblog": 0.696875,
"step": 940
},
{
"completion_length": 644.56875,
"epoch": 0.007566043596339628,
"grad_norm": 0.18431080877780914,
"kl": 0.006376700336113572,
"learning_rate": 3.7813562519996633e-06,
"loss": 0.0003,
"reward": 0.775,
"reward_std": 0.2690692335367203,
"rewards/custom_reward_simplified_v7_dblog": 0.775,
"step": 950
},
{
"completion_length": 706.7125,
"epoch": 0.007645686160511624,
"grad_norm": 0.11362796276807785,
"kl": 0.0065676989033818245,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0003,
"reward": 0.753125,
"reward_std": 0.3238763153553009,
"rewards/custom_reward_simplified_v7_dblog": 0.753125,
"step": 960
},
{
"completion_length": 591.60625,
"epoch": 0.00772532872468362,
"grad_norm": 0.006601857952773571,
"kl": 0.0061999865574762225,
"learning_rate": 3.7183793278181063e-06,
"loss": 0.0002,
"reward": 0.978125,
"reward_std": 0.32862835973501203,
"rewards/custom_reward_simplified_v7_dblog": 0.978125,
"step": 970
},
{
"completion_length": 623.40625,
"epoch": 0.007804971288855616,
"grad_norm": 0.24265889823436737,
"kl": 0.006443582929205149,
"learning_rate": 3.6865009243691015e-06,
"loss": 0.0003,
"reward": 0.790625,
"reward_std": 0.35499989837408064,
"rewards/custom_reward_simplified_v7_dblog": 0.790625,
"step": 980
},
{
"completion_length": 677.65625,
"epoch": 0.007884613853027612,
"grad_norm": 0.23094038665294647,
"kl": 0.006802499154582619,
"learning_rate": 3.654371533087586e-06,
"loss": 0.0003,
"reward": 0.80625,
"reward_std": 0.3126889310777187,
"rewards/custom_reward_simplified_v7_dblog": 0.80625,
"step": 990
},
{
"completion_length": 703.0125,
"epoch": 0.007964256417199608,
"grad_norm": 0.2269383817911148,
"kl": 0.006587388808839023,
"learning_rate": 3.621997950501156e-06,
"loss": 0.0003,
"reward": 0.83125,
"reward_std": 0.3684743233025074,
"rewards/custom_reward_simplified_v7_dblog": 0.83125,
"step": 1000
},
{
"completion_length": 702.75,
"epoch": 0.008043898981371604,
"grad_norm": 0.25571930408477783,
"kl": 0.0066094894893467425,
"learning_rate": 3.5893870247926986e-06,
"loss": 0.0003,
"reward": 0.690625,
"reward_std": 0.27608626931905744,
"rewards/custom_reward_simplified_v7_dblog": 0.690625,
"step": 1010
},
{
"completion_length": 634.18125,
"epoch": 0.0081235415455436,
"grad_norm": 0.006109423469752073,
"kl": 0.006831615581177175,
"learning_rate": 3.556545654351749e-06,
"loss": 0.0003,
"reward": 0.85625,
"reward_std": 0.2714505262672901,
"rewards/custom_reward_simplified_v7_dblog": 0.85625,
"step": 1020
},
{
"completion_length": 768.13125,
"epoch": 0.008203184109715597,
"grad_norm": 0.20112627744674683,
"kl": 0.006995444605126977,
"learning_rate": 3.5234807863152316e-06,
"loss": 0.0003,
"reward": 0.609375,
"reward_std": 0.2496856138110161,
"rewards/custom_reward_simplified_v7_dblog": 0.609375,
"step": 1030
},
{
"completion_length": 777.525,
"epoch": 0.008282826673887592,
"grad_norm": 0.2836349606513977,
"kl": 0.007392951846122741,
"learning_rate": 3.4901994150978926e-06,
"loss": 0.0003,
"reward": 0.675,
"reward_std": 0.26406350955367086,
"rewards/custom_reward_simplified_v7_dblog": 0.675,
"step": 1040
},
{
"completion_length": 719.2875,
"epoch": 0.008362469238059589,
"grad_norm": 0.1799333542585373,
"kl": 0.007057315914425999,
"learning_rate": 3.4567085809127247e-06,
"loss": 0.0003,
"reward": 0.790625,
"reward_std": 0.33950999528169634,
"rewards/custom_reward_simplified_v7_dblog": 0.790625,
"step": 1050
},
{
"completion_length": 621.4875,
"epoch": 0.008442111802231585,
"grad_norm": 0.25109627842903137,
"kl": 0.006540448497980833,
"learning_rate": 3.4230153682817112e-06,
"loss": 0.0003,
"reward": 0.85,
"reward_std": 0.30627945214509966,
"rewards/custom_reward_simplified_v7_dblog": 0.85,
"step": 1060
},
{
"completion_length": 671.04375,
"epoch": 0.00852175436640358,
"grad_norm": 0.1299162656068802,
"kl": 0.006574284215457737,
"learning_rate": 3.389126904537192e-06,
"loss": 0.0003,
"reward": 0.865625,
"reward_std": 0.37070034593343737,
"rewards/custom_reward_simplified_v7_dblog": 0.865625,
"step": 1070
},
{
"completion_length": 638.23125,
"epoch": 0.008601396930575577,
"grad_norm": 0.23796696960926056,
"kl": 0.0075248789740726355,
"learning_rate": 3.3550503583141726e-06,
"loss": 0.0003,
"reward": 0.746875,
"reward_std": 0.25020881071686746,
"rewards/custom_reward_simplified_v7_dblog": 0.746875,
"step": 1080
},
{
"completion_length": 634.9375,
"epoch": 0.008681039494747573,
"grad_norm": 0.2958204448223114,
"kl": 0.006533738202415406,
"learning_rate": 3.3207929380339034e-06,
"loss": 0.0003,
"reward": 0.896875,
"reward_std": 0.38549663573503495,
"rewards/custom_reward_simplified_v7_dblog": 0.896875,
"step": 1090
},
{
"completion_length": 661.2625,
"epoch": 0.00876068205891957,
"grad_norm": 0.007367302197962999,
"kl": 0.007355101336725056,
"learning_rate": 3.2863618903790346e-06,
"loss": 0.0003,
"reward": 0.71875,
"reward_std": 0.25932966247200967,
"rewards/custom_reward_simplified_v7_dblog": 0.71875,
"step": 1100
},
{
"completion_length": 700.48125,
"epoch": 0.008840324623091565,
"grad_norm": 0.28138336539268494,
"kl": 0.007267917576245964,
"learning_rate": 3.2517644987606827e-06,
"loss": 0.0003,
"reward": 0.9125,
"reward_std": 0.33715927675366403,
"rewards/custom_reward_simplified_v7_dblog": 0.9125,
"step": 1110
},
{
"completion_length": 662.26875,
"epoch": 0.008919967187263561,
"grad_norm": 0.1348627209663391,
"kl": 0.007481782068498433,
"learning_rate": 3.217008081777726e-06,
"loss": 0.0003,
"reward": 0.728125,
"reward_std": 0.2547163799405098,
"rewards/custom_reward_simplified_v7_dblog": 0.728125,
"step": 1120
},
{
"completion_length": 733.2125,
"epoch": 0.008999609751435557,
"grad_norm": 0.2320898026227951,
"kl": 0.007608366897329688,
"learning_rate": 3.182099991668653e-06,
"loss": 0.0003,
"reward": 0.60625,
"reward_std": 0.2975068032741547,
"rewards/custom_reward_simplified_v7_dblog": 0.60625,
"step": 1130
},
{
"completion_length": 603.5,
"epoch": 0.009079252315607553,
"grad_norm": 0.23401154577732086,
"kl": 0.007222792156971991,
"learning_rate": 3.147047612756302e-06,
"loss": 0.0003,
"reward": 0.875,
"reward_std": 0.2553515017032623,
"rewards/custom_reward_simplified_v7_dblog": 0.875,
"step": 1140
},
{
"completion_length": 704.44375,
"epoch": 0.009158894879779549,
"grad_norm": 0.2538968324661255,
"kl": 0.007968966104090213,
"learning_rate": 3.1118583598858097e-06,
"loss": 0.0003,
"reward": 0.6875,
"reward_std": 0.29204289317131044,
"rewards/custom_reward_simplified_v7_dblog": 0.6875,
"step": 1150
},
{
"completion_length": 641.88125,
"epoch": 0.009238537443951545,
"grad_norm": 0.007003675680607557,
"kl": 0.007272082474082708,
"learning_rate": 3.0765396768561005e-06,
"loss": 0.0003,
"reward": 0.875,
"reward_std": 0.2666669487953186,
"rewards/custom_reward_simplified_v7_dblog": 0.875,
"step": 1160
},
{
"completion_length": 645.55625,
"epoch": 0.009318180008123542,
"grad_norm": 0.005993107333779335,
"kl": 0.00769920782186091,
"learning_rate": 3.0410990348452572e-06,
"loss": 0.0003,
"reward": 0.846875,
"reward_std": 0.29315834268927576,
"rewards/custom_reward_simplified_v7_dblog": 0.846875,
"step": 1170
},
{
"completion_length": 690.65625,
"epoch": 0.009397822572295537,
"grad_norm": 0.196693554520607,
"kl": 0.007807633420452475,
"learning_rate": 3.0055439308300954e-06,
"loss": 0.0003,
"reward": 0.80625,
"reward_std": 0.34684801325201986,
"rewards/custom_reward_simplified_v7_dblog": 0.80625,
"step": 1180
},
{
"completion_length": 652.125,
"epoch": 0.009477465136467534,
"grad_norm": 0.009493391960859299,
"kl": 0.008702660608105362,
"learning_rate": 2.96988188600028e-06,
"loss": 0.0003,
"reward": 0.85625,
"reward_std": 0.21074047386646272,
"rewards/custom_reward_simplified_v7_dblog": 0.85625,
"step": 1190
},
{
"completion_length": 660.8125,
"epoch": 0.00955710770063953,
"grad_norm": 0.250519335269928,
"kl": 0.008729650382883846,
"learning_rate": 2.9341204441673267e-06,
"loss": 0.0003,
"reward": 0.728125,
"reward_std": 0.33152099549770353,
"rewards/custom_reward_simplified_v7_dblog": 0.728125,
"step": 1200
},
{
"completion_length": 660.7375,
"epoch": 0.009636750264811526,
"grad_norm": 0.20679971575737,
"kl": 0.00826664932537824,
"learning_rate": 2.898267170168807e-06,
"loss": 0.0003,
"reward": 0.665625,
"reward_std": 0.25403511226177217,
"rewards/custom_reward_simplified_v7_dblog": 0.665625,
"step": 1210
},
{
"completion_length": 653.59375,
"epoch": 0.009716392828983521,
"grad_norm": 0.14609546959400177,
"kl": 0.007603704649955034,
"learning_rate": 2.862329648268117e-06,
"loss": 0.0003,
"reward": 0.94375,
"reward_std": 0.26154626756906507,
"rewards/custom_reward_simplified_v7_dblog": 0.94375,
"step": 1220
},
{
"completion_length": 635.0125,
"epoch": 0.009796035393155518,
"grad_norm": 0.14301441609859467,
"kl": 0.008189951698295773,
"learning_rate": 2.82631548055013e-06,
"loss": 0.0003,
"reward": 0.9,
"reward_std": 0.2126667931675911,
"rewards/custom_reward_simplified_v7_dblog": 0.9,
"step": 1230
},
{
"completion_length": 816.5375,
"epoch": 0.009875677957327514,
"grad_norm": 0.1681988686323166,
"kl": 0.01006167777813971,
"learning_rate": 2.7902322853130758e-06,
"loss": 0.0004,
"reward": 0.51875,
"reward_std": 0.27570038065314295,
"rewards/custom_reward_simplified_v7_dblog": 0.51875,
"step": 1240
},
{
"completion_length": 710.75,
"epoch": 0.00995532052149951,
"grad_norm": 0.09834864735603333,
"kl": 0.010588118969462813,
"learning_rate": 2.754087695457005e-06,
"loss": 0.0004,
"reward": 0.6625,
"reward_std": 0.19232839569449425,
"rewards/custom_reward_simplified_v7_dblog": 0.6625,
"step": 1250
},
{
"completion_length": 615.5875,
"epoch": 0.010034963085671506,
"grad_norm": 0.14006367325782776,
"kl": 0.008278649020940065,
"learning_rate": 2.717889356869146e-06,
"loss": 0.0003,
"reward": 0.903125,
"reward_std": 0.3407335430383682,
"rewards/custom_reward_simplified_v7_dblog": 0.903125,
"step": 1260
},
{
"completion_length": 727.70625,
"epoch": 0.010114605649843502,
"grad_norm": 0.005724642425775528,
"kl": 0.009203878976404668,
"learning_rate": 2.681644926806527e-06,
"loss": 0.0004,
"reward": 0.60625,
"reward_std": 0.2156815566122532,
"rewards/custom_reward_simplified_v7_dblog": 0.60625,
"step": 1270
},
{
"completion_length": 641.9125,
"epoch": 0.0101942482140155,
"grad_norm": 0.21494239568710327,
"kl": 0.008675340004265309,
"learning_rate": 2.6453620722761897e-06,
"loss": 0.0003,
"reward": 0.81875,
"reward_std": 0.22831376343965532,
"rewards/custom_reward_simplified_v7_dblog": 0.81875,
"step": 1280
},
{
"completion_length": 650.5,
"epoch": 0.010273890778187494,
"grad_norm": 0.22972695529460907,
"kl": 0.008116158202756196,
"learning_rate": 2.6090484684133406e-06,
"loss": 0.0003,
"reward": 0.921875,
"reward_std": 0.2564812809228897,
"rewards/custom_reward_simplified_v7_dblog": 0.921875,
"step": 1290
},
{
"completion_length": 657.94375,
"epoch": 0.010353533342359491,
"grad_norm": 0.15338486433029175,
"kl": 0.009256175020709634,
"learning_rate": 2.572711796857779e-06,
"loss": 0.0004,
"reward": 0.709375,
"reward_std": 0.21537503451108933,
"rewards/custom_reward_simplified_v7_dblog": 0.709375,
"step": 1300
},
{
"completion_length": 650.58125,
"epoch": 0.010433175906531486,
"grad_norm": 0.14920295774936676,
"kl": 0.009564152918756008,
"learning_rate": 2.5363597441289574e-06,
"loss": 0.0004,
"reward": 0.828125,
"reward_std": 0.2882704295217991,
"rewards/custom_reward_simplified_v7_dblog": 0.828125,
"step": 1310
},
{
"completion_length": 723.89375,
"epoch": 0.010512818470703483,
"grad_norm": 0.20945711433887482,
"kl": 0.010788540355861187,
"learning_rate": 2.5e-06,
"loss": 0.0004,
"reward": 0.7125,
"reward_std": 0.26380954012274743,
"rewards/custom_reward_simplified_v7_dblog": 0.7125,
"step": 1320
},
{
"completion_length": 715.7,
"epoch": 0.010592461034875478,
"grad_norm": 0.16817767918109894,
"kl": 0.013910629483871163,
"learning_rate": 2.4636402558710434e-06,
"loss": 0.0006,
"reward": 0.759375,
"reward_std": 0.2193169414997101,
"rewards/custom_reward_simplified_v7_dblog": 0.759375,
"step": 1330
},
{
"completion_length": 655.90625,
"epoch": 0.010672103599047475,
"grad_norm": 0.2265154868364334,
"kl": 0.00848452327772975,
"learning_rate": 2.4272882031422216e-06,
"loss": 0.0003,
"reward": 0.78125,
"reward_std": 0.3443989932537079,
"rewards/custom_reward_simplified_v7_dblog": 0.78125,
"step": 1340
},
{
"completion_length": 660.075,
"epoch": 0.01075174616321947,
"grad_norm": 0.24644052982330322,
"kl": 0.009867909434251487,
"learning_rate": 2.3909515315866606e-06,
"loss": 0.0004,
"reward": 0.79375,
"reward_std": 0.29604131579399107,
"rewards/custom_reward_simplified_v7_dblog": 0.79375,
"step": 1350
},
{
"completion_length": 633.9125,
"epoch": 0.010831388727391467,
"grad_norm": 0.1637645810842514,
"kl": 0.00936238830909133,
"learning_rate": 2.3546379277238107e-06,
"loss": 0.0004,
"reward": 0.90625,
"reward_std": 0.324691192060709,
"rewards/custom_reward_simplified_v7_dblog": 0.90625,
"step": 1360
},
{
"completion_length": 674.03125,
"epoch": 0.010911031291563462,
"grad_norm": 0.2471015304327011,
"kl": 0.010726678185164928,
"learning_rate": 2.318355073193474e-06,
"loss": 0.0004,
"reward": 0.65625,
"reward_std": 0.21728940233588218,
"rewards/custom_reward_simplified_v7_dblog": 0.65625,
"step": 1370
},
{
"completion_length": 682.31875,
"epoch": 0.01099067385573546,
"grad_norm": 0.10079372674226761,
"kl": 0.009952771244570613,
"learning_rate": 2.2821106431308546e-06,
"loss": 0.0004,
"reward": 0.89375,
"reward_std": 0.33092204555869104,
"rewards/custom_reward_simplified_v7_dblog": 0.89375,
"step": 1380
},
{
"completion_length": 669.09375,
"epoch": 0.011070316419907454,
"grad_norm": 0.19604210555553436,
"kl": 0.011396997445262968,
"learning_rate": 2.2459123045429953e-06,
"loss": 0.0005,
"reward": 0.784375,
"reward_std": 0.29770964160561564,
"rewards/custom_reward_simplified_v7_dblog": 0.784375,
"step": 1390
},
{
"completion_length": 651.80625,
"epoch": 0.011149958984079451,
"grad_norm": 0.27397212386131287,
"kl": 0.01038803206756711,
"learning_rate": 2.2097677146869242e-06,
"loss": 0.0004,
"reward": 0.878125,
"reward_std": 0.27883157432079314,
"rewards/custom_reward_simplified_v7_dblog": 0.878125,
"step": 1400
},
{
"completion_length": 687.5125,
"epoch": 0.011229601548251448,
"grad_norm": 0.22397036850452423,
"kl": 0.012094876240007579,
"learning_rate": 2.173684519449872e-06,
"loss": 0.0005,
"reward": 0.834375,
"reward_std": 0.28866922557353974,
"rewards/custom_reward_simplified_v7_dblog": 0.834375,
"step": 1410
},
{
"completion_length": 661.26875,
"epoch": 0.011309244112423443,
"grad_norm": 0.2519758939743042,
"kl": 0.011373027227818966,
"learning_rate": 2.1376703517318835e-06,
"loss": 0.0005,
"reward": 0.853125,
"reward_std": 0.32343359887599943,
"rewards/custom_reward_simplified_v7_dblog": 0.853125,
"step": 1420
},
{
"completion_length": 677.73125,
"epoch": 0.01138888667659544,
"grad_norm": 0.2689824104309082,
"kl": 0.011312256497330964,
"learning_rate": 2.101732829831194e-06,
"loss": 0.0005,
"reward": 0.765625,
"reward_std": 0.27808423787355424,
"rewards/custom_reward_simplified_v7_dblog": 0.765625,
"step": 1430
},
{
"completion_length": 619.71875,
"epoch": 0.011468529240767435,
"grad_norm": 0.32441073656082153,
"kl": 0.010685316193848849,
"learning_rate": 2.0658795558326745e-06,
"loss": 0.0004,
"reward": 0.871875,
"reward_std": 0.2622031569480896,
"rewards/custom_reward_simplified_v7_dblog": 0.871875,
"step": 1440
},
{
"completion_length": 613.3875,
"epoch": 0.011548171804939432,
"grad_norm": 0.15561087429523468,
"kl": 0.012302201450802385,
"learning_rate": 2.0301181139997206e-06,
"loss": 0.0005,
"reward": 0.8125,
"reward_std": 0.26520399302244185,
"rewards/custom_reward_simplified_v7_dblog": 0.8125,
"step": 1450
},
{
"completion_length": 677.23125,
"epoch": 0.011627814369111427,
"grad_norm": 0.2590673267841339,
"kl": 0.011339499452151357,
"learning_rate": 1.994456069169906e-06,
"loss": 0.0005,
"reward": 0.64375,
"reward_std": 0.23993425220251083,
"rewards/custom_reward_simplified_v7_dblog": 0.64375,
"step": 1460
},
{
"completion_length": 702.1625,
"epoch": 0.011707456933283424,
"grad_norm": 0.012393876910209656,
"kl": 0.012036008480936288,
"learning_rate": 1.958900965154743e-06,
"loss": 0.0005,
"reward": 0.64375,
"reward_std": 0.21832374781370162,
"rewards/custom_reward_simplified_v7_dblog": 0.64375,
"step": 1470
},
{
"completion_length": 722.06875,
"epoch": 0.01178709949745542,
"grad_norm": 0.13200955092906952,
"kl": 0.013854384049773216,
"learning_rate": 1.9234603231439e-06,
"loss": 0.0006,
"reward": 0.790625,
"reward_std": 0.2784456007182598,
"rewards/custom_reward_simplified_v7_dblog": 0.790625,
"step": 1480
},
{
"completion_length": 664.46875,
"epoch": 0.011866742061627416,
"grad_norm": 0.14230677485466003,
"kl": 0.012553655169904232,
"learning_rate": 1.8881416401141905e-06,
"loss": 0.0005,
"reward": 0.9,
"reward_std": 0.23252918049693108,
"rewards/custom_reward_simplified_v7_dblog": 0.9,
"step": 1490
},
{
"completion_length": 653.79375,
"epoch": 0.011946384625799411,
"grad_norm": 0.17014774680137634,
"kl": 0.01346926314290613,
"learning_rate": 1.852952387243698e-06,
"loss": 0.0005,
"reward": 0.740625,
"reward_std": 0.22115055918693544,
"rewards/custom_reward_simplified_v7_dblog": 0.740625,
"step": 1500
},
{
"completion_length": 640.39375,
"epoch": 0.012026027189971408,
"grad_norm": 0.17104946076869965,
"kl": 0.013007838977500796,
"learning_rate": 1.8179000083313483e-06,
"loss": 0.0005,
"reward": 0.9,
"reward_std": 0.28725912123918534,
"rewards/custom_reward_simplified_v7_dblog": 0.9,
"step": 1510
},
{
"completion_length": 650.7125,
"epoch": 0.012105669754143405,
"grad_norm": 0.1524449735879898,
"kl": 0.012339419685304165,
"learning_rate": 1.7829919182222752e-06,
"loss": 0.0005,
"reward": 0.790625,
"reward_std": 0.3324665643274784,
"rewards/custom_reward_simplified_v7_dblog": 0.790625,
"step": 1520
},
{
"completion_length": 674.3625,
"epoch": 0.0121853123183154,
"grad_norm": 0.2344941943883896,
"kl": 0.012514100456610323,
"learning_rate": 1.7482355012393177e-06,
"loss": 0.0005,
"reward": 0.859375,
"reward_std": 0.3387090668082237,
"rewards/custom_reward_simplified_v7_dblog": 0.859375,
"step": 1530
},
{
"completion_length": 718.6,
"epoch": 0.012264954882487397,
"grad_norm": 0.2631664276123047,
"kl": 0.014576551388017833,
"learning_rate": 1.7136381096209665e-06,
"loss": 0.0006,
"reward": 0.653125,
"reward_std": 0.24619419425725936,
"rewards/custom_reward_simplified_v7_dblog": 0.653125,
"step": 1540
},
{
"completion_length": 706.28125,
"epoch": 0.012344597446659392,
"grad_norm": 0.20134921371936798,
"kl": 0.012202254333533346,
"learning_rate": 1.6792070619660977e-06,
"loss": 0.0005,
"reward": 0.84375,
"reward_std": 0.3321776181459427,
"rewards/custom_reward_simplified_v7_dblog": 0.84375,
"step": 1550
},
{
"completion_length": 645.28125,
"epoch": 0.01242424001083139,
"grad_norm": 0.1851159930229187,
"kl": 0.014482964109629393,
"learning_rate": 1.6449496416858285e-06,
"loss": 0.0006,
"reward": 0.85625,
"reward_std": 0.20507382601499557,
"rewards/custom_reward_simplified_v7_dblog": 0.85625,
"step": 1560
},
{
"completion_length": 614.08125,
"epoch": 0.012503882575003384,
"grad_norm": 0.27418458461761475,
"kl": 0.013118641986511647,
"learning_rate": 1.6108730954628093e-06,
"loss": 0.0005,
"reward": 0.79375,
"reward_std": 0.2820776253938675,
"rewards/custom_reward_simplified_v7_dblog": 0.79375,
"step": 1570
},
{
"completion_length": 695.91875,
"epoch": 0.012583525139175381,
"grad_norm": 0.2425900250673294,
"kl": 0.013333506928756834,
"learning_rate": 1.5769846317182894e-06,
"loss": 0.0005,
"reward": 0.7625,
"reward_std": 0.2879462748765945,
"rewards/custom_reward_simplified_v7_dblog": 0.7625,
"step": 1580
},
{
"completion_length": 673.99375,
"epoch": 0.012663167703347376,
"grad_norm": 0.2331763356924057,
"kl": 0.013234515953809024,
"learning_rate": 1.5432914190872757e-06,
"loss": 0.0005,
"reward": 0.775,
"reward_std": 0.2913659870624542,
"rewards/custom_reward_simplified_v7_dblog": 0.775,
"step": 1590
},
{
"completion_length": 678.54375,
"epoch": 0.012742810267519373,
"grad_norm": 0.16657988727092743,
"kl": 0.012798944069072605,
"learning_rate": 1.509800584902108e-06,
"loss": 0.0005,
"reward": 0.759375,
"reward_std": 0.2901748239994049,
"rewards/custom_reward_simplified_v7_dblog": 0.759375,
"step": 1600
},
{
"completion_length": 652.49375,
"epoch": 0.012822452831691368,
"grad_norm": 0.12168209999799728,
"kl": 0.012750855972990393,
"learning_rate": 1.4765192136847686e-06,
"loss": 0.0005,
"reward": 0.728125,
"reward_std": 0.26915703564882276,
"rewards/custom_reward_simplified_v7_dblog": 0.728125,
"step": 1610
},
{
"completion_length": 660.95625,
"epoch": 0.012902095395863365,
"grad_norm": 0.13546766340732574,
"kl": 0.013546877074986696,
"learning_rate": 1.443454345648252e-06,
"loss": 0.0005,
"reward": 0.790625,
"reward_std": 0.1937400370836258,
"rewards/custom_reward_simplified_v7_dblog": 0.790625,
"step": 1620
},
{
"completion_length": 638.00625,
"epoch": 0.012981737960035362,
"grad_norm": 0.17955924570560455,
"kl": 0.012779112858697771,
"learning_rate": 1.4106129752073023e-06,
"loss": 0.0005,
"reward": 0.790625,
"reward_std": 0.2674853280186653,
"rewards/custom_reward_simplified_v7_dblog": 0.790625,
"step": 1630
},
{
"completion_length": 678.1125,
"epoch": 0.013061380524207357,
"grad_norm": 0.2616170644760132,
"kl": 0.01720189054030925,
"learning_rate": 1.3780020494988447e-06,
"loss": 0.0007,
"reward": 0.771875,
"reward_std": 0.27255760729312895,
"rewards/custom_reward_simplified_v7_dblog": 0.771875,
"step": 1640
},
{
"completion_length": 639.43125,
"epoch": 0.013141023088379354,
"grad_norm": 0.1487816423177719,
"kl": 0.014415727299638092,
"learning_rate": 1.3456284669124159e-06,
"loss": 0.0006,
"reward": 0.73125,
"reward_std": 0.24298151433467866,
"rewards/custom_reward_simplified_v7_dblog": 0.73125,
"step": 1650
},
{
"completion_length": 727.9625,
"epoch": 0.01322066565255135,
"grad_norm": 0.14750860631465912,
"kl": 0.018067248188890515,
"learning_rate": 1.313499075630899e-06,
"loss": 0.0007,
"reward": 0.721875,
"reward_std": 0.30838647186756135,
"rewards/custom_reward_simplified_v7_dblog": 0.721875,
"step": 1660
},
{
"completion_length": 780.08125,
"epoch": 0.013300308216723346,
"grad_norm": 0.2386309951543808,
"kl": 0.017110610962845385,
"learning_rate": 1.2816206721818944e-06,
"loss": 0.0007,
"reward": 0.6375,
"reward_std": 0.26727318242192266,
"rewards/custom_reward_simplified_v7_dblog": 0.6375,
"step": 1670
},
{
"completion_length": 655.70625,
"epoch": 0.013379950780895341,
"grad_norm": 0.2751936614513397,
"kl": 0.01622524333652109,
"learning_rate": 1.2500000000000007e-06,
"loss": 0.0006,
"reward": 0.878125,
"reward_std": 0.284642493724823,
"rewards/custom_reward_simplified_v7_dblog": 0.878125,
"step": 1680
},
{
"completion_length": 684.98125,
"epoch": 0.013459593345067338,
"grad_norm": 0.23118546605110168,
"kl": 0.01642036633566022,
"learning_rate": 1.218643748000337e-06,
"loss": 0.0007,
"reward": 0.85625,
"reward_std": 0.339317075163126,
"rewards/custom_reward_simplified_v7_dblog": 0.85625,
"step": 1690
},
{
"completion_length": 743.51875,
"epoch": 0.013539235909239333,
"grad_norm": 0.22867274284362793,
"kl": 0.01721250016707927,
"learning_rate": 1.1875585491636e-06,
"loss": 0.0007,
"reward": 0.653125,
"reward_std": 0.277196903526783,
"rewards/custom_reward_simplified_v7_dblog": 0.653125,
"step": 1700
},
{
"completion_length": 637.9625,
"epoch": 0.01361887847341133,
"grad_norm": 0.2428259700536728,
"kl": 0.014563425956293941,
"learning_rate": 1.1567509791329402e-06,
"loss": 0.0006,
"reward": 0.865625,
"reward_std": 0.23967689424753189,
"rewards/custom_reward_simplified_v7_dblog": 0.865625,
"step": 1710
},
{
"completion_length": 722.925,
"epoch": 0.013698521037583325,
"grad_norm": 0.21737752854824066,
"kl": 0.014987437543459237,
"learning_rate": 1.1262275548229852e-06,
"loss": 0.0006,
"reward": 0.725,
"reward_std": 0.26179009675979614,
"rewards/custom_reward_simplified_v7_dblog": 0.725,
"step": 1720
},
{
"completion_length": 633.31875,
"epoch": 0.013778163601755322,
"grad_norm": 0.22654354572296143,
"kl": 0.013244283269159496,
"learning_rate": 1.0959947330412681e-06,
"loss": 0.0005,
"reward": 0.921875,
"reward_std": 0.2066536843776703,
"rewards/custom_reward_simplified_v7_dblog": 0.921875,
"step": 1730
},
{
"completion_length": 615.29375,
"epoch": 0.013857806165927319,
"grad_norm": 0.22673261165618896,
"kl": 0.014753601653501392,
"learning_rate": 1.0660589091223854e-06,
"loss": 0.0006,
"reward": 0.815625,
"reward_std": 0.30853241235017775,
"rewards/custom_reward_simplified_v7_dblog": 0.815625,
"step": 1740
},
{
"completion_length": 630.3625,
"epoch": 0.013937448730099314,
"grad_norm": 0.012196751311421394,
"kl": 0.01440229129511863,
"learning_rate": 1.0364264155751489e-06,
"loss": 0.0006,
"reward": 0.915625,
"reward_std": 0.23927971720695496,
"rewards/custom_reward_simplified_v7_dblog": 0.915625,
"step": 1750
},
{
"completion_length": 715.2125,
"epoch": 0.014017091294271311,
"grad_norm": 0.2587921619415283,
"kl": 0.017100332980044188,
"learning_rate": 1.0071035207430352e-06,
"loss": 0.0007,
"reward": 0.74375,
"reward_std": 0.2990465022623539,
"rewards/custom_reward_simplified_v7_dblog": 0.74375,
"step": 1760
},
{
"completion_length": 682.74375,
"epoch": 0.014096733858443306,
"grad_norm": 0.24313370883464813,
"kl": 0.01778110705781728,
"learning_rate": 9.780964274781984e-07,
"loss": 0.0007,
"reward": 0.68125,
"reward_std": 0.2005969136953354,
"rewards/custom_reward_simplified_v7_dblog": 0.68125,
"step": 1770
},
{
"completion_length": 718.31875,
"epoch": 0.014176376422615303,
"grad_norm": 0.18841393291950226,
"kl": 0.015946343122050167,
"learning_rate": 9.494112718293503e-07,
"loss": 0.0006,
"reward": 0.771875,
"reward_std": 0.27307887077331544,
"rewards/custom_reward_simplified_v7_dblog": 0.771875,
"step": 1780
},
{
"completion_length": 707.1875,
"epoch": 0.014256018986787298,
"grad_norm": 0.2333621084690094,
"kl": 0.01652351173106581,
"learning_rate": 9.210541217437566e-07,
"loss": 0.0007,
"reward": 0.8125,
"reward_std": 0.2497081995010376,
"rewards/custom_reward_simplified_v7_dblog": 0.8125,
"step": 1790
},
{
"completion_length": 728.5375,
"epoch": 0.014335661550959295,
"grad_norm": 0.26783886551856995,
"kl": 0.018553019547834993,
"learning_rate": 8.930309757836517e-07,
"loss": 0.0007,
"reward": 0.75,
"reward_std": 0.28967257887125014,
"rewards/custom_reward_simplified_v7_dblog": 0.75,
"step": 1800
},
{
"completion_length": 689.26875,
"epoch": 0.01441530411513129,
"grad_norm": 0.17589329183101654,
"kl": 0.016255489736795425,
"learning_rate": 8.653477618573261e-07,
"loss": 0.0007,
"reward": 0.765625,
"reward_std": 0.3363394603133202,
"rewards/custom_reward_simplified_v7_dblog": 0.765625,
"step": 1810
},
{
"completion_length": 640.91875,
"epoch": 0.014494946679303287,
"grad_norm": 0.21075929701328278,
"kl": 0.015922663966193795,
"learning_rate": 8.380103359651554e-07,
"loss": 0.0006,
"reward": 0.925,
"reward_std": 0.3459245666861534,
"rewards/custom_reward_simplified_v7_dblog": 0.925,
"step": 1820
},
{
"completion_length": 708.60625,
"epoch": 0.014574589243475282,
"grad_norm": 0.00766308419406414,
"kl": 0.01772608202882111,
"learning_rate": 8.110244809608494e-07,
"loss": 0.0007,
"reward": 0.73125,
"reward_std": 0.2913930006325245,
"rewards/custom_reward_simplified_v7_dblog": 0.73125,
"step": 1830
},
{
"completion_length": 660.0375,
"epoch": 0.014654231807647279,
"grad_norm": 0.20974037051200867,
"kl": 0.014227323909290135,
"learning_rate": 7.843959053281663e-07,
"loss": 0.0006,
"reward": 0.809375,
"reward_std": 0.24926668480038644,
"rewards/custom_reward_simplified_v7_dblog": 0.809375,
"step": 1840
},
{
"completion_length": 729.71875,
"epoch": 0.014733874371819274,
"grad_norm": 0.24099427461624146,
"kl": 0.018935651518404484,
"learning_rate": 7.581302419733633e-07,
"loss": 0.0008,
"reward": 0.690625,
"reward_std": 0.32810748890042307,
"rewards/custom_reward_simplified_v7_dblog": 0.690625,
"step": 1850
},
{
"completion_length": 649.98125,
"epoch": 0.014813516935991271,
"grad_norm": 0.013280795887112617,
"kl": 0.01633880774024874,
"learning_rate": 7.322330470336314e-07,
"loss": 0.0007,
"reward": 0.91875,
"reward_std": 0.24432293996214866,
"rewards/custom_reward_simplified_v7_dblog": 0.91875,
"step": 1860
},
{
"completion_length": 669.09375,
"epoch": 0.014893159500163268,
"grad_norm": 0.2837064266204834,
"kl": 0.014348302804864942,
"learning_rate": 7.067097987017762e-07,
"loss": 0.0006,
"reward": 0.690625,
"reward_std": 0.2307182878255844,
"rewards/custom_reward_simplified_v7_dblog": 0.690625,
"step": 1870
},
{
"completion_length": 662.9625,
"epoch": 0.014972802064335263,
"grad_norm": 0.25689443945884705,
"kl": 0.01656266492791474,
"learning_rate": 6.815658960673782e-07,
"loss": 0.0007,
"reward": 0.85625,
"reward_std": 0.22758262380957603,
"rewards/custom_reward_simplified_v7_dblog": 0.85625,
"step": 1880
},
{
"completion_length": 719.24375,
"epoch": 0.01505244462850726,
"grad_norm": 0.22542421519756317,
"kl": 0.01744127394631505,
"learning_rate": 6.568066579746901e-07,
"loss": 0.0007,
"reward": 0.76875,
"reward_std": 0.2790658660233021,
"rewards/custom_reward_simplified_v7_dblog": 0.76875,
"step": 1890
},
{
"completion_length": 633.64375,
"epoch": 0.015132087192679255,
"grad_norm": 0.00903425831347704,
"kl": 0.014375879801809788,
"learning_rate": 6.324373218975105e-07,
"loss": 0.0006,
"reward": 0.725,
"reward_std": 0.2382744610309601,
"rewards/custom_reward_simplified_v7_dblog": 0.725,
"step": 1900
},
{
"completion_length": 767.7375,
"epoch": 0.015211729756851252,
"grad_norm": 0.1330222189426422,
"kl": 0.02190765142440796,
"learning_rate": 6.084630428312679e-07,
"loss": 0.0009,
"reward": 0.66875,
"reward_std": 0.27546602860093117,
"rewards/custom_reward_simplified_v7_dblog": 0.66875,
"step": 1910
},
{
"completion_length": 726.63125,
"epoch": 0.015291372321023247,
"grad_norm": 0.21655875444412231,
"kl": 0.02581467442214489,
"learning_rate": 5.848888922025553e-07,
"loss": 0.001,
"reward": 0.834375,
"reward_std": 0.38373097851872445,
"rewards/custom_reward_simplified_v7_dblog": 0.834375,
"step": 1920
},
{
"completion_length": 688.56875,
"epoch": 0.015371014885195244,
"grad_norm": 0.22155120968818665,
"kl": 0.025313653564080597,
"learning_rate": 5.617198567963353e-07,
"loss": 0.001,
"reward": 0.64375,
"reward_std": 0.2539114162325859,
"rewards/custom_reward_simplified_v7_dblog": 0.64375,
"step": 1930
},
{
"completion_length": 676.9125,
"epoch": 0.01545065744936724,
"grad_norm": 0.2373446673154831,
"kl": 0.018907574540935456,
"learning_rate": 5.389608377010608e-07,
"loss": 0.0008,
"reward": 0.821875,
"reward_std": 0.1906539335846901,
"rewards/custom_reward_simplified_v7_dblog": 0.821875,
"step": 1940
},
{
"completion_length": 640.675,
"epoch": 0.015530300013539236,
"grad_norm": 0.1865774542093277,
"kl": 0.014899229886941612,
"learning_rate": 5.166166492719124e-07,
"loss": 0.0006,
"reward": 0.725,
"reward_std": 0.2747412838041782,
"rewards/custom_reward_simplified_v7_dblog": 0.725,
"step": 1950
},
{
"completion_length": 651.70625,
"epoch": 0.015609942577711231,
"grad_norm": 0.2434624284505844,
"kl": 0.01636054664850235,
"learning_rate": 4.946920181123904e-07,
"loss": 0.0007,
"reward": 0.7625,
"reward_std": 0.2852359592914581,
"rewards/custom_reward_simplified_v7_dblog": 0.7625,
"step": 1960
},
{
"completion_length": 654.6625,
"epoch": 0.015689585141883226,
"grad_norm": 0.20749981701374054,
"kl": 0.018196922447532415,
"learning_rate": 4.7319158207446953e-07,
"loss": 0.0007,
"reward": 0.715625,
"reward_std": 0.2198973834514618,
"rewards/custom_reward_simplified_v7_dblog": 0.715625,
"step": 1970
},
{
"completion_length": 641.45,
"epoch": 0.015769227706055225,
"grad_norm": 0.23187489807605743,
"kl": 0.017989515024237335,
"learning_rate": 4.5211988927752026e-07,
"loss": 0.0007,
"reward": 0.7875,
"reward_std": 0.24450960606336594,
"rewards/custom_reward_simplified_v7_dblog": 0.7875,
"step": 1980
},
{
"completion_length": 643.6375,
"epoch": 0.01584887027022722,
"grad_norm": 0.235895574092865,
"kl": 0.015841626143082977,
"learning_rate": 4.3148139714622365e-07,
"loss": 0.0006,
"reward": 0.896875,
"reward_std": 0.26189937368035315,
"rewards/custom_reward_simplified_v7_dblog": 0.896875,
"step": 1990
},
{
"completion_length": 629.60625,
"epoch": 0.015928512834399215,
"grad_norm": 0.2776155471801758,
"kl": 0.015184593386948109,
"learning_rate": 4.1128047146765936e-07,
"loss": 0.0006,
"reward": 0.921875,
"reward_std": 0.23378355875611306,
"rewards/custom_reward_simplified_v7_dblog": 0.921875,
"step": 2000
},
{
"completion_length": 710.65,
"epoch": 0.016008155398571214,
"grad_norm": 0.13598495721817017,
"kl": 0.01561300114262849,
"learning_rate": 3.915213854677863e-07,
"loss": 0.0006,
"reward": 0.859375,
"reward_std": 0.22324086129665374,
"rewards/custom_reward_simplified_v7_dblog": 0.859375,
"step": 2010
},
{
"completion_length": 600.3625,
"epoch": 0.01608779796274321,
"grad_norm": 0.33102965354919434,
"kl": 0.01562973433174193,
"learning_rate": 3.722083189075007e-07,
"loss": 0.0006,
"reward": 1.0125,
"reward_std": 0.37898894101381303,
"rewards/custom_reward_simplified_v7_dblog": 1.0125,
"step": 2020
},
{
"completion_length": 633.40625,
"epoch": 0.016167440526915204,
"grad_norm": 0.009714637883007526,
"kl": 0.01524353977292776,
"learning_rate": 3.5334535719846767e-07,
"loss": 0.0006,
"reward": 0.775,
"reward_std": 0.1905590772628784,
"rewards/custom_reward_simplified_v7_dblog": 0.775,
"step": 2030
},
{
"completion_length": 674.3625,
"epoch": 0.0162470830910872,
"grad_norm": 0.2587895095348358,
"kl": 0.015684280125424267,
"learning_rate": 3.3493649053890325e-07,
"loss": 0.0006,
"reward": 0.978125,
"reward_std": 0.33772673830389977,
"rewards/custom_reward_simplified_v7_dblog": 0.978125,
"step": 2040
},
{
"completion_length": 623.1375,
"epoch": 0.016326725655259198,
"grad_norm": 0.24910244345664978,
"kl": 0.014677197439596057,
"learning_rate": 3.1698561306951065e-07,
"loss": 0.0006,
"reward": 0.925,
"reward_std": 0.3512172996997833,
"rewards/custom_reward_simplified_v7_dblog": 0.925,
"step": 2050
},
{
"completion_length": 678.0375,
"epoch": 0.016406368219431193,
"grad_norm": 0.20536966621875763,
"kl": 0.017746813944540918,
"learning_rate": 2.9949652204972257e-07,
"loss": 0.0007,
"reward": 0.828125,
"reward_std": 0.34475562572479246,
"rewards/custom_reward_simplified_v7_dblog": 0.828125,
"step": 2060
},
{
"completion_length": 634.36875,
"epoch": 0.016486010783603188,
"grad_norm": 0.26798176765441895,
"kl": 0.017110086302272974,
"learning_rate": 2.8247291705444575e-07,
"loss": 0.0007,
"reward": 0.89375,
"reward_std": 0.24814453721046448,
"rewards/custom_reward_simplified_v7_dblog": 0.89375,
"step": 2070
},
{
"completion_length": 710.6875,
"epoch": 0.016565653347775183,
"grad_norm": 0.20649504661560059,
"kl": 0.018557686172425748,
"learning_rate": 2.6591839919146963e-07,
"loss": 0.0007,
"reward": 0.828125,
"reward_std": 0.34967463091015816,
"rewards/custom_reward_simplified_v7_dblog": 0.828125,
"step": 2080
},
{
"completion_length": 642.375,
"epoch": 0.016645295911947182,
"grad_norm": 0.016043314710259438,
"kl": 0.018814650364220142,
"learning_rate": 2.4983647033969714e-07,
"loss": 0.0008,
"reward": 0.859375,
"reward_std": 0.3110216066241264,
"rewards/custom_reward_simplified_v7_dblog": 0.859375,
"step": 2090
},
{
"completion_length": 686.65625,
"epoch": 0.016724938476119177,
"grad_norm": 0.26343393325805664,
"kl": 0.019906887435354292,
"learning_rate": 2.3423053240837518e-07,
"loss": 0.0008,
"reward": 0.715625,
"reward_std": 0.17099330350756645,
"rewards/custom_reward_simplified_v7_dblog": 0.715625,
"step": 2100
},
{
"completion_length": 656.8,
"epoch": 0.016804581040291172,
"grad_norm": 0.01307599525898695,
"kl": 0.020065448177047075,
"learning_rate": 2.1910388661746495e-07,
"loss": 0.0008,
"reward": 0.8,
"reward_std": 0.20212240219116212,
"rewards/custom_reward_simplified_v7_dblog": 0.8,
"step": 2110
},
{
"completion_length": 714.25625,
"epoch": 0.01688422360446317,
"grad_norm": 0.2202935814857483,
"kl": 0.02329984272364527,
"learning_rate": 2.044597327993153e-07,
"loss": 0.0009,
"reward": 0.7875,
"reward_std": 0.307485481351614,
"rewards/custom_reward_simplified_v7_dblog": 0.7875,
"step": 2120
},
{
"completion_length": 685.39375,
"epoch": 0.016963866168635166,
"grad_norm": 0.30204537510871887,
"kl": 0.018967814440838993,
"learning_rate": 1.9030116872178317e-07,
"loss": 0.0008,
"reward": 0.803125,
"reward_std": 0.3279333204030991,
"rewards/custom_reward_simplified_v7_dblog": 0.803125,
"step": 2130
},
{
"completion_length": 674.49375,
"epoch": 0.01704350873280716,
"grad_norm": 0.012012571096420288,
"kl": 0.02170075795147568,
"learning_rate": 1.7663118943294367e-07,
"loss": 0.0009,
"reward": 0.703125,
"reward_std": 0.2257047951221466,
"rewards/custom_reward_simplified_v7_dblog": 0.703125,
"step": 2140
},
{
"completion_length": 694.63125,
"epoch": 0.017123151296979156,
"grad_norm": 0.01635037176311016,
"kl": 0.02094450539443642,
"learning_rate": 1.6345268662752904e-07,
"loss": 0.0008,
"reward": 0.7125,
"reward_std": 0.2917635254561901,
"rewards/custom_reward_simplified_v7_dblog": 0.7125,
"step": 2150
},
{
"completion_length": 702.025,
"epoch": 0.017202793861151155,
"grad_norm": 0.008707295171916485,
"kl": 0.01914967515040189,
"learning_rate": 1.507684480352292e-07,
"loss": 0.0008,
"reward": 0.821875,
"reward_std": 0.2691307656466961,
"rewards/custom_reward_simplified_v7_dblog": 0.821875,
"step": 2160
},
{
"completion_length": 704.90625,
"epoch": 0.01728243642532315,
"grad_norm": 0.1347748190164566,
"kl": 0.017809830722399056,
"learning_rate": 1.3858115683098832e-07,
"loss": 0.0007,
"reward": 0.9,
"reward_std": 0.30937733352184293,
"rewards/custom_reward_simplified_v7_dblog": 0.9,
"step": 2170
},
{
"completion_length": 650.13125,
"epoch": 0.017362078989495145,
"grad_norm": 0.013826651498675346,
"kl": 0.017964964429847897,
"learning_rate": 1.2689339106741529e-07,
"loss": 0.0007,
"reward": 0.821875,
"reward_std": 0.2382724992930889,
"rewards/custom_reward_simplified_v7_dblog": 0.821875,
"step": 2180
},
{
"completion_length": 574.075,
"epoch": 0.01744172155366714,
"grad_norm": 0.21891085803508759,
"kl": 0.013470867811702193,
"learning_rate": 1.1570762312943295e-07,
"loss": 0.0005,
"reward": 0.9875,
"reward_std": 0.2131643146276474,
"rewards/custom_reward_simplified_v7_dblog": 0.9875,
"step": 2190
},
{
"completion_length": 645.95,
"epoch": 0.01752136411783914,
"grad_norm": 0.28153711557388306,
"kl": 0.01899058516137302,
"learning_rate": 1.0502621921127776e-07,
"loss": 0.0008,
"reward": 0.834375,
"reward_std": 0.29732906967401507,
"rewards/custom_reward_simplified_v7_dblog": 0.834375,
"step": 2200
},
{
"completion_length": 618.19375,
"epoch": 0.017601006682011134,
"grad_norm": 0.25354552268981934,
"kl": 0.016854454204440115,
"learning_rate": 9.485143881596715e-08,
"loss": 0.0007,
"reward": 0.85625,
"reward_std": 0.25810291022062304,
"rewards/custom_reward_simplified_v7_dblog": 0.85625,
"step": 2210
},
{
"completion_length": 638.425,
"epoch": 0.01768064924618313,
"grad_norm": 0.2272520810365677,
"kl": 0.018312370544299482,
"learning_rate": 8.518543427732951e-08,
"loss": 0.0007,
"reward": 0.753125,
"reward_std": 0.2212974861264229,
"rewards/custom_reward_simplified_v7_dblog": 0.753125,
"step": 2220
},
{
"completion_length": 695.54375,
"epoch": 0.017760291810355128,
"grad_norm": 0.27871131896972656,
"kl": 0.02111934470012784,
"learning_rate": 7.603025030471001e-08,
"loss": 0.0008,
"reward": 0.75,
"reward_std": 0.2767858363687992,
"rewards/custom_reward_simplified_v7_dblog": 0.75,
"step": 2230
},
{
"completion_length": 632.99375,
"epoch": 0.017839934374527123,
"grad_norm": 0.008834543637931347,
"kl": 0.016428270121105017,
"learning_rate": 6.738782355044048e-08,
"loss": 0.0007,
"reward": 0.80625,
"reward_std": 0.21589626967906952,
"rewards/custom_reward_simplified_v7_dblog": 0.80625,
"step": 2240
},
{
"completion_length": 634.0625,
"epoch": 0.017919576938699118,
"grad_norm": 0.286683052778244,
"kl": 0.016679517249576746,
"learning_rate": 5.92599822001666e-08,
"loss": 0.0007,
"reward": 0.853125,
"reward_std": 0.2905955038964748,
"rewards/custom_reward_simplified_v7_dblog": 0.853125,
"step": 2250
},
{
"completion_length": 610.85,
"epoch": 0.017999219502871113,
"grad_norm": 0.28028422594070435,
"kl": 0.017966749798506498,
"learning_rate": 5.164844558612131e-08,
"loss": 0.0007,
"reward": 0.971875,
"reward_std": 0.3067967638373375,
"rewards/custom_reward_simplified_v7_dblog": 0.971875,
"step": 2260
},
{
"completion_length": 566.9625,
"epoch": 0.018078862067043112,
"grad_norm": 0.3413483202457428,
"kl": 0.01525729293935001,
"learning_rate": 4.455482382342336e-08,
"loss": 0.0006,
"reward": 0.959375,
"reward_std": 0.3084723956882954,
"rewards/custom_reward_simplified_v7_dblog": 0.959375,
"step": 2270
},
{
"completion_length": 662.05625,
"epoch": 0.018158504631215107,
"grad_norm": 0.153013676404953,
"kl": 0.017893880722112954,
"learning_rate": 3.798061746947995e-08,
"loss": 0.0007,
"reward": 0.753125,
"reward_std": 0.2308400221168995,
"rewards/custom_reward_simplified_v7_dblog": 0.753125,
"step": 2280
},
{
"completion_length": 615.55,
"epoch": 0.018238147195387102,
"grad_norm": 0.2853679060935974,
"kl": 0.0166370629100129,
"learning_rate": 3.1927217206564884e-08,
"loss": 0.0007,
"reward": 0.74375,
"reward_std": 0.25018117427825926,
"rewards/custom_reward_simplified_v7_dblog": 0.74375,
"step": 2290
},
{
"completion_length": 710.45,
"epoch": 0.018317789759559097,
"grad_norm": 0.011245607398450375,
"kl": 0.01835272475145757,
"learning_rate": 2.6395903547638825e-08,
"loss": 0.0007,
"reward": 0.78125,
"reward_std": 0.2881218962371349,
"rewards/custom_reward_simplified_v7_dblog": 0.78125,
"step": 2300
},
{
"completion_length": 565.125,
"epoch": 0.018397432323731096,
"grad_norm": 0.25337040424346924,
"kl": 0.01471406095661223,
"learning_rate": 2.1387846565474047e-08,
"loss": 0.0006,
"reward": 1.078125,
"reward_std": 0.4393742740154266,
"rewards/custom_reward_simplified_v7_dblog": 1.078125,
"step": 2310
},
{
"completion_length": 692.70625,
"epoch": 0.01847707488790309,
"grad_norm": 0.20416221022605896,
"kl": 0.02145941834896803,
"learning_rate": 1.6904105645142443e-08,
"loss": 0.0009,
"reward": 0.5625,
"reward_std": 0.1467035911977291,
"rewards/custom_reward_simplified_v7_dblog": 0.5625,
"step": 2320
}
],
"logging_steps": 10,
"max_steps": 2400,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}