smirki's picture
Training in progress, step 700, checkpoint
900f91b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.005574979492039726,
"eval_steps": 500,
"global_step": 700,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 739.7375,
"epoch": 7.964256417199609e-05,
"grad_norm": 0.17606443166732788,
"kl": 0.0006033612473402173,
"learning_rate": 6.25e-07,
"loss": 0.0,
"reward": -3.3789249688386915,
"reward_std": 1.234160715341568,
"rewards/custom_reward_logic_v2": -3.3789249688386915,
"step": 10
},
{
"completion_length": 881.3875,
"epoch": 0.00015928512834399218,
"grad_norm": 0.1873437762260437,
"kl": 0.0007326043589273468,
"learning_rate": 1.25e-06,
"loss": 0.0,
"reward": -4.281049972772598,
"reward_std": 1.4586432427167892,
"rewards/custom_reward_logic_v2": -4.281049972772598,
"step": 20
},
{
"completion_length": 798.325,
"epoch": 0.00023892769251598824,
"grad_norm": 0.16695357859134674,
"kl": 0.0007708041899604723,
"learning_rate": 1.8750000000000003e-06,
"loss": 0.0,
"reward": -3.79504998922348,
"reward_std": 1.4004287779331208,
"rewards/custom_reward_logic_v2": -3.79504998922348,
"step": 30
},
{
"completion_length": 885.76875,
"epoch": 0.00031857025668798435,
"grad_norm": 0.18757909536361694,
"kl": 0.0007291340152733028,
"learning_rate": 2.5e-06,
"loss": 0.0,
"reward": -3.8223875135183336,
"reward_std": 1.053759826719761,
"rewards/custom_reward_logic_v2": -3.8223875135183336,
"step": 40
},
{
"completion_length": 806.35625,
"epoch": 0.0003982128208599804,
"grad_norm": 0.1678084433078766,
"kl": 0.000756343750981614,
"learning_rate": 3.125e-06,
"loss": 0.0,
"reward": -3.831325000524521,
"reward_std": 1.2611359059810638,
"rewards/custom_reward_logic_v2": -3.831325000524521,
"step": 50
},
{
"completion_length": 860.95,
"epoch": 0.0004778553850319765,
"grad_norm": 0.16106949746608734,
"kl": 0.0007545762317022308,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0,
"reward": -3.99547501206398,
"reward_std": 1.233138319849968,
"rewards/custom_reward_logic_v2": -3.99547501206398,
"step": 60
},
{
"completion_length": 831.175,
"epoch": 0.0005574979492039726,
"grad_norm": 0.1723652333021164,
"kl": 0.0007971685263328254,
"learning_rate": 4.3750000000000005e-06,
"loss": 0.0,
"reward": -4.036549943685531,
"reward_std": 1.5394920334219933,
"rewards/custom_reward_logic_v2": -4.036549943685531,
"step": 70
},
{
"completion_length": 874.325,
"epoch": 0.0006371405133759687,
"grad_norm": 0.2079666703939438,
"kl": 0.0008876581850927323,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": -3.92242501154542,
"reward_std": 1.2604085817933082,
"rewards/custom_reward_logic_v2": -3.92242501154542,
"step": 80
},
{
"completion_length": 791.91875,
"epoch": 0.0007167830775479647,
"grad_norm": 0.16253575682640076,
"kl": 0.0010255174711346626,
"learning_rate": 4.997620553954645e-06,
"loss": 0.0,
"reward": -3.364587500691414,
"reward_std": 1.2228698313236237,
"rewards/custom_reward_logic_v2": -3.364587500691414,
"step": 90
},
{
"completion_length": 846.2875,
"epoch": 0.0007964256417199608,
"grad_norm": 0.18019770085811615,
"kl": 0.0013353260728763416,
"learning_rate": 4.990486745229364e-06,
"loss": 0.0001,
"reward": -3.805912530422211,
"reward_std": 1.2458222389221192,
"rewards/custom_reward_logic_v2": -3.805912530422211,
"step": 100
},
{
"completion_length": 729.04375,
"epoch": 0.0008760682058919569,
"grad_norm": 0.1617293506860733,
"kl": 0.0018967354553751647,
"learning_rate": 4.978612153434527e-06,
"loss": 0.0001,
"reward": -3.071175017207861,
"reward_std": 1.3524149775505065,
"rewards/custom_reward_logic_v2": -3.071175017207861,
"step": 110
},
{
"completion_length": 641.74375,
"epoch": 0.000955710770063953,
"grad_norm": 0.26608461141586304,
"kl": 0.0029119997401721776,
"learning_rate": 4.962019382530521e-06,
"loss": 0.0001,
"reward": -2.690687493979931,
"reward_std": 1.0754198133945465,
"rewards/custom_reward_logic_v2": -2.690687493979931,
"step": 120
},
{
"completion_length": 883.9875,
"epoch": 0.001035353334235949,
"grad_norm": 0.7612231373786926,
"kl": 0.003597881377208978,
"learning_rate": 4.9407400177998335e-06,
"loss": 0.0001,
"reward": -3.8035999715328215,
"reward_std": 1.2502110481262207,
"rewards/custom_reward_logic_v2": -3.8035999715328215,
"step": 130
},
{
"completion_length": 524.48125,
"epoch": 0.0011149958984079452,
"grad_norm": 0.9543402791023254,
"kl": 0.08978197913384064,
"learning_rate": 4.914814565722671e-06,
"loss": 0.0036,
"reward": -2.0596874909475447,
"reward_std": 1.3678732179105282,
"rewards/custom_reward_logic_v2": -2.0596874909475447,
"step": 140
},
{
"completion_length": 46.43125,
"epoch": 0.0011946384625799412,
"grad_norm": 0.7850804328918457,
"kl": 0.3018287725746632,
"learning_rate": 4.884292376870567e-06,
"loss": 0.0121,
"reward": -0.04024999849498272,
"reward_std": 0.430637900531292,
"rewards/custom_reward_logic_v2": -0.04024999849498272,
"step": 150
},
{
"completion_length": 18.93125,
"epoch": 0.0012742810267519374,
"grad_norm": 0.032512303441762924,
"kl": 0.33459745422005654,
"learning_rate": 4.849231551964771e-06,
"loss": 0.0134,
"reward": 0.1650000035762787,
"reward_std": 0.07605109438300132,
"rewards/custom_reward_logic_v2": 0.1650000035762787,
"step": 160
},
{
"completion_length": 17.4375,
"epoch": 0.0013539235909239334,
"grad_norm": 0.02004638873040676,
"kl": 0.35064528286457064,
"learning_rate": 4.809698831278217e-06,
"loss": 0.014,
"reward": 0.08999999985098839,
"reward_std": 0.125558003783226,
"rewards/custom_reward_logic_v2": 0.08999999985098839,
"step": 170
},
{
"completion_length": 23.975,
"epoch": 0.0014335661550959294,
"grad_norm": 0.2281995564699173,
"kl": 0.3118164837360382,
"learning_rate": 4.765769467591626e-06,
"loss": 0.0125,
"reward": 0.08099999986588954,
"reward_std": 0.18301311507821083,
"rewards/custom_reward_logic_v2": 0.08099999986588954,
"step": 180
},
{
"completion_length": 17.7,
"epoch": 0.0015132087192679256,
"grad_norm": 0.20832708477973938,
"kl": 0.34881954491138456,
"learning_rate": 4.717527082945555e-06,
"loss": 0.014,
"reward": 0.14687500111758708,
"reward_std": 0.13193419948220253,
"rewards/custom_reward_logic_v2": 0.14687500111758708,
"step": 190
},
{
"completion_length": 28.76875,
"epoch": 0.0015928512834399217,
"grad_norm": 0.2148224264383316,
"kl": 0.4086977861821651,
"learning_rate": 4.665063509461098e-06,
"loss": 0.0163,
"reward": 0.06411250084638595,
"reward_std": 0.09681975245475768,
"rewards/custom_reward_logic_v2": 0.06411250084638595,
"step": 200
},
{
"completion_length": 19.3,
"epoch": 0.0016724938476119177,
"grad_norm": 0.03454764187335968,
"kl": 0.3337583176791668,
"learning_rate": 4.608478614532215e-06,
"loss": 0.0134,
"reward": 0.21312500052154065,
"reward_std": 0.1542310357093811,
"rewards/custom_reward_logic_v2": 0.21312500052154065,
"step": 210
},
{
"completion_length": 48.96875,
"epoch": 0.0017521364117839139,
"grad_norm": 0.8877259492874146,
"kl": 0.3230514988303185,
"learning_rate": 4.54788011072248e-06,
"loss": 0.0129,
"reward": -0.12147499993443489,
"reward_std": 0.4157312333583832,
"rewards/custom_reward_logic_v2": -0.12147499993443489,
"step": 220
},
{
"completion_length": 19.41875,
"epoch": 0.0018317789759559099,
"grad_norm": 0.7465932369232178,
"kl": 0.32680382803082464,
"learning_rate": 4.4833833507280884e-06,
"loss": 0.0131,
"reward": 0.14000000059604645,
"reward_std": 0.09731742069125175,
"rewards/custom_reward_logic_v2": 0.14000000059604645,
"step": 230
},
{
"completion_length": 23.425,
"epoch": 0.001911421540127906,
"grad_norm": 0.4111487567424774,
"kl": 0.3509559452533722,
"learning_rate": 4.415111107797445e-06,
"loss": 0.014,
"reward": 0.18286250159144402,
"reward_std": 0.1811980500817299,
"rewards/custom_reward_logic_v2": 0.18286250159144402,
"step": 240
},
{
"completion_length": 18.91875,
"epoch": 0.001991064104299902,
"grad_norm": 0.8882763385772705,
"kl": 0.3525215476751328,
"learning_rate": 4.34319334202531e-06,
"loss": 0.0141,
"reward": 0.17062499970197678,
"reward_std": 0.11504097878932953,
"rewards/custom_reward_logic_v2": 0.17062499970197678,
"step": 250
},
{
"completion_length": 20.475,
"epoch": 0.002070706668471898,
"grad_norm": 0.03645075112581253,
"kl": 0.3291649468243122,
"learning_rate": 4.267766952966369e-06,
"loss": 0.0132,
"reward": 0.16500000059604644,
"reward_std": 0.1858065977692604,
"rewards/custom_reward_logic_v2": 0.16500000059604644,
"step": 260
},
{
"completion_length": 24.5625,
"epoch": 0.0021503492326438944,
"grad_norm": 1.1167131662368774,
"kl": 0.33756194859743116,
"learning_rate": 4.188975519039151e-06,
"loss": 0.0135,
"reward": 0.10505000110715627,
"reward_std": 0.0828484557569027,
"rewards/custom_reward_logic_v2": 0.10505000110715627,
"step": 270
},
{
"completion_length": 19.925,
"epoch": 0.0022299917968158904,
"grad_norm": 0.8635123372077942,
"kl": 0.32979664355516436,
"learning_rate": 4.106969024216348e-06,
"loss": 0.0132,
"reward": 0.20062500163912772,
"reward_std": 0.1258012667298317,
"rewards/custom_reward_logic_v2": 0.20062500163912772,
"step": 280
},
{
"completion_length": 41.35625,
"epoch": 0.0023096343609878864,
"grad_norm": 0.7731335163116455,
"kl": 0.29900490418076514,
"learning_rate": 4.021903572521802e-06,
"loss": 0.012,
"reward": 0.13356250263750552,
"reward_std": 0.11855373680591583,
"rewards/custom_reward_logic_v2": 0.13356250263750552,
"step": 290
},
{
"completion_length": 21.425,
"epoch": 0.0023892769251598824,
"grad_norm": 0.050558220595121384,
"kl": 0.30905950888991357,
"learning_rate": 3.933941090877615e-06,
"loss": 0.0124,
"reward": 0.10625000111758709,
"reward_std": 0.07851103022694587,
"rewards/custom_reward_logic_v2": 0.10625000111758709,
"step": 300
},
{
"completion_length": 28.39375,
"epoch": 0.0024689194893318784,
"grad_norm": 1.2737127542495728,
"kl": 0.3259002223610878,
"learning_rate": 3.8432490208670605e-06,
"loss": 0.013,
"reward": 0.07012500055134296,
"reward_std": 0.21550666987895967,
"rewards/custom_reward_logic_v2": 0.07012500055134296,
"step": 310
},
{
"completion_length": 20.49375,
"epoch": 0.002548562053503875,
"grad_norm": 1.3667010068893433,
"kl": 0.32961594611406325,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0132,
"reward": 0.15562500059604645,
"reward_std": 0.14379026368260384,
"rewards/custom_reward_logic_v2": 0.15562500059604645,
"step": 320
},
{
"completion_length": 23.7625,
"epoch": 0.002628204617675871,
"grad_norm": 0.9662195444107056,
"kl": 0.3291011206805706,
"learning_rate": 3.654371533087586e-06,
"loss": 0.0132,
"reward": 0.20617500003427267,
"reward_std": 0.12530190348625184,
"rewards/custom_reward_logic_v2": 0.20617500003427267,
"step": 330
},
{
"completion_length": 19.15,
"epoch": 0.002707847181847867,
"grad_norm": 2.964785099029541,
"kl": 0.3629206448793411,
"learning_rate": 3.556545654351749e-06,
"loss": 0.0145,
"reward": 0.10437500067055225,
"reward_std": 0.12071752324700355,
"rewards/custom_reward_logic_v2": 0.10437500067055225,
"step": 340
},
{
"completion_length": 20.4875,
"epoch": 0.002787489746019863,
"grad_norm": 1.0044533014297485,
"kl": 0.3254102662205696,
"learning_rate": 3.4567085809127247e-06,
"loss": 0.013,
"reward": 0.15562499798834323,
"reward_std": 0.15355074554681777,
"rewards/custom_reward_logic_v2": 0.15562499798834323,
"step": 350
},
{
"completion_length": 21.21875,
"epoch": 0.002867132310191859,
"grad_norm": 0.8673160672187805,
"kl": 0.328788036108017,
"learning_rate": 3.3550503583141726e-06,
"loss": 0.0132,
"reward": 0.22808750197291375,
"reward_std": 0.14038661643862724,
"rewards/custom_reward_logic_v2": 0.22808750197291375,
"step": 360
},
{
"completion_length": 18.29375,
"epoch": 0.002946774874363855,
"grad_norm": 1.275578260421753,
"kl": 0.3586613781750202,
"learning_rate": 3.2517644987606827e-06,
"loss": 0.0143,
"reward": 0.09437500052154064,
"reward_std": 0.13283729180693626,
"rewards/custom_reward_logic_v2": 0.09437500052154064,
"step": 370
},
{
"completion_length": 19.625,
"epoch": 0.0030264174385358513,
"grad_norm": 1.135249376296997,
"kl": 0.3399433046579361,
"learning_rate": 3.147047612756302e-06,
"loss": 0.0136,
"reward": 0.18000000156462193,
"reward_std": 0.1102687232196331,
"rewards/custom_reward_logic_v2": 0.18000000156462193,
"step": 380
},
{
"completion_length": 18.65625,
"epoch": 0.0031060600027078473,
"grad_norm": 0.0214656013995409,
"kl": 0.3453727260231972,
"learning_rate": 3.0410990348452572e-06,
"loss": 0.0138,
"reward": 0.14312500059604644,
"reward_std": 0.21185824573040007,
"rewards/custom_reward_logic_v2": 0.14312500059604644,
"step": 390
},
{
"completion_length": 22.5875,
"epoch": 0.0031857025668798433,
"grad_norm": 1.1392817497253418,
"kl": 0.3561431519687176,
"learning_rate": 2.9341204441673267e-06,
"loss": 0.0142,
"reward": 0.09312500022351741,
"reward_std": 0.09467698186635971,
"rewards/custom_reward_logic_v2": 0.09312500022351741,
"step": 400
},
{
"completion_length": 21.9125,
"epoch": 0.0032653451310518393,
"grad_norm": 0.12919628620147705,
"kl": 0.3515960440039635,
"learning_rate": 2.82631548055013e-06,
"loss": 0.0141,
"reward": 0.07376250103116036,
"reward_std": 0.15707473903894426,
"rewards/custom_reward_logic_v2": 0.07376250103116036,
"step": 410
},
{
"completion_length": 20.325,
"epoch": 0.0033449876952238353,
"grad_norm": 0.08202961087226868,
"kl": 0.34852803200483323,
"learning_rate": 2.717889356869146e-06,
"loss": 0.0139,
"reward": 0.2074999999254942,
"reward_std": 0.11485048606991768,
"rewards/custom_reward_logic_v2": 0.2074999999254942,
"step": 420
},
{
"completion_length": 19.7375,
"epoch": 0.0034246302593958313,
"grad_norm": 1.5309367179870605,
"kl": 0.339575307816267,
"learning_rate": 2.6090484684133406e-06,
"loss": 0.0136,
"reward": 0.03375000059604645,
"reward_std": 0.08364979848265648,
"rewards/custom_reward_logic_v2": 0.03375000059604645,
"step": 430
},
{
"completion_length": 29.5375,
"epoch": 0.0035042728235678278,
"grad_norm": 0.08438611030578613,
"kl": 0.34286700189113617,
"learning_rate": 2.5e-06,
"loss": 0.0137,
"reward": 0.08044999912381172,
"reward_std": 0.16319628208875656,
"rewards/custom_reward_logic_v2": 0.08044999912381172,
"step": 440
},
{
"completion_length": 23.2125,
"epoch": 0.0035839153877398238,
"grad_norm": 0.11283387243747711,
"kl": 0.3263735562562943,
"learning_rate": 2.3909515315866606e-06,
"loss": 0.0131,
"reward": 0.11125000230967999,
"reward_std": 0.10089804157614708,
"rewards/custom_reward_logic_v2": 0.11125000230967999,
"step": 450
},
{
"completion_length": 20.14375,
"epoch": 0.0036635579519118198,
"grad_norm": 0.7745999693870544,
"kl": 0.3447819516062737,
"learning_rate": 2.2821106431308546e-06,
"loss": 0.0138,
"reward": 0.11187500096857547,
"reward_std": 0.12871785834431648,
"rewards/custom_reward_logic_v2": 0.11187500096857547,
"step": 460
},
{
"completion_length": 27.35,
"epoch": 0.003743200516083816,
"grad_norm": 1.4974488019943237,
"kl": 0.3425402037799358,
"learning_rate": 2.173684519449872e-06,
"loss": 0.0137,
"reward": 0.10542500019073486,
"reward_std": 0.22862085253000258,
"rewards/custom_reward_logic_v2": 0.10542500019073486,
"step": 470
},
{
"completion_length": 19.40625,
"epoch": 0.003822843080255812,
"grad_norm": 1.1579034328460693,
"kl": 0.3382424309849739,
"learning_rate": 2.0658795558326745e-06,
"loss": 0.0135,
"reward": 0.15562499947845937,
"reward_std": 0.1210292175412178,
"rewards/custom_reward_logic_v2": 0.15562499947845937,
"step": 480
},
{
"completion_length": 37.24375,
"epoch": 0.003902485644427808,
"grad_norm": 0.7052723169326782,
"kl": 0.35214473977684974,
"learning_rate": 1.958900965154743e-06,
"loss": 0.0141,
"reward": 0.07051250115036964,
"reward_std": 0.20176818892359732,
"rewards/custom_reward_logic_v2": 0.07051250115036964,
"step": 490
},
{
"completion_length": 19.23125,
"epoch": 0.003982128208599804,
"grad_norm": 0.062097422778606415,
"kl": 0.3568013899028301,
"learning_rate": 1.852952387243698e-06,
"loss": 0.0143,
"reward": 0.13374999910593033,
"reward_std": 0.10076134353876114,
"rewards/custom_reward_logic_v2": 0.13374999910593033,
"step": 500
},
{
"completion_length": 18.61875,
"epoch": 0.0040617707727718,
"grad_norm": 1.673584222793579,
"kl": 0.3592236742377281,
"learning_rate": 1.7482355012393177e-06,
"loss": 0.0144,
"reward": 0.14000000134110452,
"reward_std": 0.09233622029423713,
"rewards/custom_reward_logic_v2": 0.14000000134110452,
"step": 510
},
{
"completion_length": 20.41875,
"epoch": 0.004141413336943796,
"grad_norm": 1.1183210611343384,
"kl": 0.33706687912344935,
"learning_rate": 1.6449496416858285e-06,
"loss": 0.0135,
"reward": 0.10500000081956387,
"reward_std": 0.09869231358170509,
"rewards/custom_reward_logic_v2": 0.10500000081956387,
"step": 520
},
{
"completion_length": 20.8125,
"epoch": 0.004221055901115793,
"grad_norm": 0.4382721185684204,
"kl": 0.3610161267220974,
"learning_rate": 1.5432914190872757e-06,
"loss": 0.0144,
"reward": 0.19562500044703485,
"reward_std": 0.1188055507838726,
"rewards/custom_reward_logic_v2": 0.19562500044703485,
"step": 530
},
{
"completion_length": 19.46875,
"epoch": 0.004300698465287789,
"grad_norm": 1.3095043897628784,
"kl": 0.34188042730093005,
"learning_rate": 1.443454345648252e-06,
"loss": 0.0137,
"reward": 0.20749999955296516,
"reward_std": 0.19307591021060944,
"rewards/custom_reward_logic_v2": 0.20749999955296516,
"step": 540
},
{
"completion_length": 21.63125,
"epoch": 0.004380341029459785,
"grad_norm": 0.04259713739156723,
"kl": 0.3633933149278164,
"learning_rate": 1.3456284669124159e-06,
"loss": 0.0145,
"reward": 0.11875000149011612,
"reward_std": 0.1355846919119358,
"rewards/custom_reward_logic_v2": 0.11875000149011612,
"step": 550
},
{
"completion_length": 19.2625,
"epoch": 0.004459983593631781,
"grad_norm": 0.5744329690933228,
"kl": 0.3458960048854351,
"learning_rate": 1.2500000000000007e-06,
"loss": 0.0138,
"reward": 0.3293750025331974,
"reward_std": 0.104243653267622,
"rewards/custom_reward_logic_v2": 0.3293750025331974,
"step": 560
},
{
"completion_length": 20.05625,
"epoch": 0.004539626157803777,
"grad_norm": 0.1549508273601532,
"kl": 0.346449576318264,
"learning_rate": 1.1567509791329402e-06,
"loss": 0.0139,
"reward": 0.17625000029802323,
"reward_std": 0.1429968483746052,
"rewards/custom_reward_logic_v2": 0.17625000029802323,
"step": 570
},
{
"completion_length": 19.86875,
"epoch": 0.004619268721975773,
"grad_norm": 0.14351911842823029,
"kl": 0.39532790407538415,
"learning_rate": 1.0660589091223854e-06,
"loss": 0.0158,
"reward": 0.17437500022351743,
"reward_std": 0.13940104842185974,
"rewards/custom_reward_logic_v2": 0.17437500022351743,
"step": 580
},
{
"completion_length": 19.28125,
"epoch": 0.004698911286147769,
"grad_norm": 1.1975979804992676,
"kl": 0.3690756544470787,
"learning_rate": 9.780964274781984e-07,
"loss": 0.0148,
"reward": 0.20562500059604644,
"reward_std": 0.11339747980237007,
"rewards/custom_reward_logic_v2": 0.20562500059604644,
"step": 590
},
{
"completion_length": 18.3125,
"epoch": 0.004778553850319765,
"grad_norm": 0.03664500266313553,
"kl": 0.34169030636548997,
"learning_rate": 8.930309757836517e-07,
"loss": 0.0137,
"reward": 0.12624999806284903,
"reward_std": 0.1257291093468666,
"rewards/custom_reward_logic_v2": 0.12624999806284903,
"step": 600
},
{
"completion_length": 20.73125,
"epoch": 0.004858196414491761,
"grad_norm": 1.0636727809906006,
"kl": 0.32965768277645113,
"learning_rate": 8.110244809608494e-07,
"loss": 0.0132,
"reward": 0.1900000013411045,
"reward_std": 0.2276224449276924,
"rewards/custom_reward_logic_v2": 0.1900000013411045,
"step": 610
},
{
"completion_length": 21.09375,
"epoch": 0.004937838978663757,
"grad_norm": 0.6413007378578186,
"kl": 0.4171911731362343,
"learning_rate": 7.322330470336314e-07,
"loss": 0.0167,
"reward": 0.06624999977648258,
"reward_std": 0.08008950427174569,
"rewards/custom_reward_logic_v2": 0.06624999977648258,
"step": 620
},
{
"completion_length": 19.6125,
"epoch": 0.005017481542835753,
"grad_norm": 0.4128471612930298,
"kl": 0.3929149940609932,
"learning_rate": 6.568066579746901e-07,
"loss": 0.0157,
"reward": 0.1518750011920929,
"reward_std": 0.09802244454622269,
"rewards/custom_reward_logic_v2": 0.1518750011920929,
"step": 630
},
{
"completion_length": 18.51875,
"epoch": 0.00509712410700775,
"grad_norm": 1.2197966575622559,
"kl": 0.4328078910708427,
"learning_rate": 5.848888922025553e-07,
"loss": 0.0173,
"reward": 0.045000001043081286,
"reward_std": 0.08135274946689605,
"rewards/custom_reward_logic_v2": 0.045000001043081286,
"step": 640
},
{
"completion_length": 19.89375,
"epoch": 0.005176766671179746,
"grad_norm": 0.24087023735046387,
"kl": 0.39142851531505585,
"learning_rate": 5.166166492719124e-07,
"loss": 0.0157,
"reward": 0.0818750023841858,
"reward_std": 0.11013087928295136,
"rewards/custom_reward_logic_v2": 0.0818750023841858,
"step": 650
},
{
"completion_length": 19.23125,
"epoch": 0.005256409235351742,
"grad_norm": 1.1138290166854858,
"kl": 0.3609082795679569,
"learning_rate": 4.5211988927752026e-07,
"loss": 0.0144,
"reward": 0.13375000059604644,
"reward_std": 0.22290636524558066,
"rewards/custom_reward_logic_v2": 0.13375000059604644,
"step": 660
},
{
"completion_length": 21.1375,
"epoch": 0.005336051799523738,
"grad_norm": 0.7887033820152283,
"kl": 0.36398947462439535,
"learning_rate": 3.915213854677863e-07,
"loss": 0.0146,
"reward": 0.07750000171363354,
"reward_std": 0.09986742436885834,
"rewards/custom_reward_logic_v2": 0.07750000171363354,
"step": 670
},
{
"completion_length": 20.7375,
"epoch": 0.005415694363695734,
"grad_norm": 1.2118674516677856,
"kl": 0.34819948896765707,
"learning_rate": 3.3493649053890325e-07,
"loss": 0.0139,
"reward": 0.14000000059604645,
"reward_std": 0.15659263283014296,
"rewards/custom_reward_logic_v2": 0.14000000059604645,
"step": 680
},
{
"completion_length": 25.03125,
"epoch": 0.00549533692786773,
"grad_norm": 0.6678434014320374,
"kl": 0.3506194405257702,
"learning_rate": 2.8247291705444575e-07,
"loss": 0.014,
"reward": 0.10087500289082527,
"reward_std": 0.19995234534144402,
"rewards/custom_reward_logic_v2": 0.10087500289082527,
"step": 690
},
{
"completion_length": 19.75,
"epoch": 0.005574979492039726,
"grad_norm": 1.028297781944275,
"kl": 0.33339232876896857,
"learning_rate": 2.3423053240837518e-07,
"loss": 0.0133,
"reward": 0.09312500022351741,
"reward_std": 0.09688087031245232,
"rewards/custom_reward_logic_v2": 0.09312500022351741,
"step": 700
}
],
"logging_steps": 10,
"max_steps": 800,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}