{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.003783021798169814, "eval_steps": 500, "global_step": 475, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 760.69375, "epoch": 7.964256417199609e-05, "grad_norm": 0.1797904223203659, "kl": 0.0005932799191214144, "learning_rate": 2.0833333333333333e-07, "loss": 0.0, "reward": -2.440429142862558, "reward_std": 1.2779221206903457, "rewards/custom_reward_logic_v4_batch_streak": -2.440429142862558, "step": 10 }, { "completion_length": 872.7625, "epoch": 0.00015928512834399218, "grad_norm": 0.17659229040145874, "kl": 0.0007143634895328433, "learning_rate": 4.1666666666666667e-07, "loss": 0.0, "reward": -3.123613569140434, "reward_std": 1.3400578200817108, "rewards/custom_reward_logic_v4_batch_streak": -3.123613569140434, "step": 20 }, { "completion_length": 810.54375, "epoch": 0.00023892769251598824, "grad_norm": 0.15338778495788574, "kl": 0.0007711865269811824, "learning_rate": 6.25e-07, "loss": 0.0, "reward": -2.5357208490371703, "reward_std": 1.530259145796299, "rewards/custom_reward_logic_v4_batch_streak": -2.5357208490371703, "step": 30 }, { "completion_length": 916.375, "epoch": 0.00031857025668798435, "grad_norm": 0.18688252568244934, "kl": 0.0007225456734886393, "learning_rate": 8.333333333333333e-07, "loss": 0.0, "reward": -2.8683072827756404, "reward_std": 1.2736640498042107, "rewards/custom_reward_logic_v4_batch_streak": -2.8683072827756404, "step": 40 }, { "completion_length": 807.425, "epoch": 0.0003982128208599804, "grad_norm": 0.13943322002887726, "kl": 0.0007514923432609067, "learning_rate": 1.0416666666666667e-06, "loss": 0.0, "reward": -2.7300030887126923, "reward_std": 1.1818634122610092, "rewards/custom_reward_logic_v4_batch_streak": -2.7300030887126923, "step": 50 }, { "completion_length": 877.45625, "epoch": 0.0004778553850319765, "grad_norm": 0.1495009958744049, "kl": 0.0007267521868925542, "learning_rate": 1.25e-06, "loss": 0.0, "reward": -3.006270831823349, "reward_std": 1.3253380209207535, "rewards/custom_reward_logic_v4_batch_streak": -3.006270831823349, "step": 60 }, { "completion_length": 855.1875, "epoch": 0.0005574979492039726, "grad_norm": 0.17970818281173706, "kl": 0.0007509023474995047, "learning_rate": 1.4583333333333335e-06, "loss": 0.0, "reward": -3.1822583481669424, "reward_std": 1.442990928888321, "rewards/custom_reward_logic_v4_batch_streak": -3.1822583481669424, "step": 70 }, { "completion_length": 900.675, "epoch": 0.0006371405133759687, "grad_norm": 0.17815305292606354, "kl": 0.0007381761737633496, "learning_rate": 1.6666666666666667e-06, "loss": 0.0, "reward": -2.6316666841506957, "reward_std": 1.4472795471549034, "rewards/custom_reward_logic_v4_batch_streak": -2.6316666841506957, "step": 80 }, { "completion_length": 813.15625, "epoch": 0.0007167830775479647, "grad_norm": 0.14456522464752197, "kl": 0.0007069750688970089, "learning_rate": 1.8750000000000003e-06, "loss": 0.0, "reward": -2.426281274855137, "reward_std": 1.375066339969635, "rewards/custom_reward_logic_v4_batch_streak": -2.426281274855137, "step": 90 }, { "completion_length": 878.14375, "epoch": 0.0007964256417199608, "grad_norm": 0.16884203255176544, "kl": 0.0007481900160200894, "learning_rate": 2.0833333333333334e-06, "loss": 0.0, "reward": -3.0540165841579436, "reward_std": 1.2169389009475708, "rewards/custom_reward_logic_v4_batch_streak": -3.0540165841579436, "step": 100 }, { "completion_length": 868.33125, "epoch": 0.0008760682058919569, "grad_norm": 0.16743966937065125, "kl": 0.0007829183887224644, "learning_rate": 2.2916666666666666e-06, "loss": 0.0, "reward": -3.056391695141792, "reward_std": 1.5562937021255494, "rewards/custom_reward_logic_v4_batch_streak": -3.056391695141792, "step": 110 }, { "completion_length": 720.83125, "epoch": 0.000955710770063953, "grad_norm": 0.24302148818969727, "kl": 0.000822445034282282, "learning_rate": 2.5e-06, "loss": 0.0, "reward": -1.9373770911246537, "reward_std": 1.258049274981022, "rewards/custom_reward_logic_v4_batch_streak": -1.9373770911246537, "step": 120 }, { "completion_length": 978.64375, "epoch": 0.001035353334235949, "grad_norm": 0.17354010045528412, "kl": 0.000797444346244447, "learning_rate": 2.7083333333333334e-06, "loss": 0.0, "reward": -3.6928374975919724, "reward_std": 1.3044769436120986, "rewards/custom_reward_logic_v4_batch_streak": -3.6928374975919724, "step": 130 }, { "completion_length": 829.23125, "epoch": 0.0011149958984079452, "grad_norm": 0.223940908908844, "kl": 0.0009410725091584027, "learning_rate": 2.916666666666667e-06, "loss": 0.0, "reward": -2.7124270781874658, "reward_std": 1.4558519303798676, "rewards/custom_reward_logic_v4_batch_streak": -2.7124270781874658, "step": 140 }, { "completion_length": 837.85625, "epoch": 0.0011946384625799412, "grad_norm": 0.1868145614862442, "kl": 0.0009413436113391071, "learning_rate": 3.125e-06, "loss": 0.0, "reward": -2.4677958875894546, "reward_std": 1.2407755464315415, "rewards/custom_reward_logic_v4_batch_streak": -2.4677958875894546, "step": 150 }, { "completion_length": 743.45625, "epoch": 0.0012742810267519374, "grad_norm": 0.1819629669189453, "kl": 0.0011384535144316032, "learning_rate": 3.3333333333333333e-06, "loss": 0.0, "reward": -2.255056257545948, "reward_std": 1.2829032361507415, "rewards/custom_reward_logic_v4_batch_streak": -2.255056257545948, "step": 160 }, { "completion_length": 631.0125, "epoch": 0.0013539235909239334, "grad_norm": 0.22249050438404083, "kl": 0.001341403860715218, "learning_rate": 3.5416666666666673e-06, "loss": 0.0001, "reward": -1.380665649473667, "reward_std": 1.301654589176178, "rewards/custom_reward_logic_v4_batch_streak": -1.380665649473667, "step": 170 }, { "completion_length": 676.075, "epoch": 0.0014335661550959294, "grad_norm": 0.23651157319545746, "kl": 0.001932383590610698, "learning_rate": 3.7500000000000005e-06, "loss": 0.0001, "reward": -2.217651057243347, "reward_std": 1.1832541689276694, "rewards/custom_reward_logic_v4_batch_streak": -2.217651057243347, "step": 180 }, { "completion_length": 716.3125, "epoch": 0.0015132087192679256, "grad_norm": 0.930211067199707, "kl": 0.0031241982942447066, "learning_rate": 3.958333333333333e-06, "loss": 0.0001, "reward": -2.1449333682656286, "reward_std": 1.272219567000866, "rewards/custom_reward_logic_v4_batch_streak": -2.1449333682656286, "step": 190 }, { "completion_length": 723.33125, "epoch": 0.0015928512834399217, "grad_norm": 0.23318283259868622, "kl": 0.0023288113647140563, "learning_rate": 4.166666666666667e-06, "loss": 0.0001, "reward": -2.1110031098127364, "reward_std": 1.1132041677832603, "rewards/custom_reward_logic_v4_batch_streak": -2.1110031098127364, "step": 200 }, { "completion_length": 706.54375, "epoch": 0.0016724938476119177, "grad_norm": 0.26098617911338806, "kl": 0.003358338767429814, "learning_rate": 4.3750000000000005e-06, "loss": 0.0001, "reward": -2.3224770683795213, "reward_std": 1.2092776507139207, "rewards/custom_reward_logic_v4_batch_streak": -2.3224770683795213, "step": 210 }, { "completion_length": 811.08125, "epoch": 0.0017521364117839139, "grad_norm": 1.0346773862838745, "kl": 0.005868167115841061, "learning_rate": 4.583333333333333e-06, "loss": 0.0002, "reward": -2.6366010539233686, "reward_std": 1.3989030092954635, "rewards/custom_reward_logic_v4_batch_streak": -2.6366010539233686, "step": 220 }, { "completion_length": 211.44375, "epoch": 0.0018317789759559099, "grad_norm": 0.8315042853355408, "kl": 0.20073928231140598, "learning_rate": 4.791666666666668e-06, "loss": 0.008, "reward": -0.5939687395468354, "reward_std": 0.7772796258330346, "rewards/custom_reward_logic_v4_batch_streak": -0.5939687395468354, "step": 230 }, { "completion_length": 27.75625, "epoch": 0.001911421540127906, "grad_norm": 0.7632138133049011, "kl": 0.3509638696908951, "learning_rate": 5e-06, "loss": 0.014, "reward": 0.14633646439760922, "reward_std": 0.2748184122145176, "rewards/custom_reward_logic_v4_batch_streak": 0.14633646439760922, "step": 240 }, { "completion_length": 28.425, "epoch": 0.001991064104299902, "grad_norm": 0.5807326436042786, "kl": 0.3273721463978291, "learning_rate": 4.999735579817769e-06, "loss": 0.0131, "reward": 0.11352083273231983, "reward_std": 0.2841566324234009, "rewards/custom_reward_logic_v4_batch_streak": 0.11352083273231983, "step": 250 }, { "completion_length": 19.95625, "epoch": 0.002070706668471898, "grad_norm": 0.03958822786808014, "kl": 0.33097796961665155, "learning_rate": 4.998942375205502e-06, "loss": 0.0132, "reward": 0.10125000216066837, "reward_std": 0.1778048150241375, "rewards/custom_reward_logic_v4_batch_streak": 0.10125000216066837, "step": 260 }, { "completion_length": 25.31875, "epoch": 0.0021503492326438944, "grad_norm": 1.1055091619491577, "kl": 0.34529968798160554, "learning_rate": 4.997620553954645e-06, "loss": 0.0138, "reward": 0.15995520818978548, "reward_std": 0.2416255235671997, "rewards/custom_reward_logic_v4_batch_streak": 0.15995520818978548, "step": 270 }, { "completion_length": 19.18125, "epoch": 0.0022299917968158904, "grad_norm": 0.799738883972168, "kl": 0.3341792456805706, "learning_rate": 4.995770395678171e-06, "loss": 0.0134, "reward": 0.2549999985843897, "reward_std": 0.1341205656528473, "rewards/custom_reward_logic_v4_batch_streak": 0.2549999985843897, "step": 280 }, { "completion_length": 27.70625, "epoch": 0.0023096343609878864, "grad_norm": 0.02682190202176571, "kl": 0.3167652033269405, "learning_rate": 4.993392291751431e-06, "loss": 0.0127, "reward": 0.17528020832687616, "reward_std": 0.17236635982990264, "rewards/custom_reward_logic_v4_batch_streak": 0.17528020832687616, "step": 290 }, { "completion_length": 25.6875, "epoch": 0.0023892769251598824, "grad_norm": 0.6407962441444397, "kl": 0.32680063620209693, "learning_rate": 4.990486745229364e-06, "loss": 0.0131, "reward": 0.16030624657869338, "reward_std": 0.2103947691619396, "rewards/custom_reward_logic_v4_batch_streak": 0.16030624657869338, "step": 300 }, { "completion_length": 35.26875, "epoch": 0.0024689194893318784, "grad_norm": 0.8467773795127869, "kl": 0.3288160003721714, "learning_rate": 4.9870543707400835e-06, "loss": 0.0132, "reward": 0.14182916339486837, "reward_std": 0.23385633081197738, "rewards/custom_reward_logic_v4_batch_streak": 0.14182916339486837, "step": 310 }, { "completion_length": 21.6, "epoch": 0.002548562053503875, "grad_norm": 1.2785643339157104, "kl": 0.32860224805772303, "learning_rate": 4.983095894354858e-06, "loss": 0.0131, "reward": 0.29108228590339424, "reward_std": 0.22271099761128427, "rewards/custom_reward_logic_v4_batch_streak": 0.29108228590339424, "step": 320 }, { "completion_length": 19.51875, "epoch": 0.002628204617675871, "grad_norm": 0.04232069477438927, "kl": 0.33390086218714715, "learning_rate": 4.978612153434527e-06, "loss": 0.0134, "reward": 0.44767499435693026, "reward_std": 0.18191057518124581, "rewards/custom_reward_logic_v4_batch_streak": 0.44767499435693026, "step": 330 }, { "completion_length": 18.40625, "epoch": 0.002707847181847867, "grad_norm": 0.9771687984466553, "kl": 0.392937633395195, "learning_rate": 4.973604096452361e-06, "loss": 0.0157, "reward": 0.2312499986961484, "reward_std": 0.14022469893097878, "rewards/custom_reward_logic_v4_batch_streak": 0.2312499986961484, "step": 340 }, { "completion_length": 19.75625, "epoch": 0.002787489746019863, "grad_norm": 0.1014222577214241, "kl": 0.33149235770106317, "learning_rate": 4.968072782793436e-06, "loss": 0.0133, "reward": 0.2172499977052212, "reward_std": 0.10931163281202316, "rewards/custom_reward_logic_v4_batch_streak": 0.2172499977052212, "step": 350 }, { "completion_length": 18.9375, "epoch": 0.002867132310191859, "grad_norm": 0.5375373363494873, "kl": 0.33093191757798196, "learning_rate": 4.962019382530521e-06, "loss": 0.0132, "reward": 0.3819999981671572, "reward_std": 0.1661699414253235, "rewards/custom_reward_logic_v4_batch_streak": 0.3819999981671572, "step": 360 }, { "completion_length": 18.0, "epoch": 0.002946774874363855, "grad_norm": 1.0857776403427124, "kl": 0.3886502429842949, "learning_rate": 4.955445176176577e-06, "loss": 0.0155, "reward": 0.17512499764561654, "reward_std": 0.16201305240392685, "rewards/custom_reward_logic_v4_batch_streak": 0.17512499764561654, "step": 370 }, { "completion_length": 28.75, "epoch": 0.0030264174385358513, "grad_norm": 1.218420147895813, "kl": 0.3669083297252655, "learning_rate": 4.948351554413879e-06, "loss": 0.0147, "reward": 0.2639281203970313, "reward_std": 0.22483034804463387, "rewards/custom_reward_logic_v4_batch_streak": 0.2639281203970313, "step": 380 }, { "completion_length": 19.06875, "epoch": 0.0031060600027078473, "grad_norm": 0.09816683083772659, "kl": 0.36456960439682007, "learning_rate": 4.9407400177998335e-06, "loss": 0.0146, "reward": 0.12062499690800906, "reward_std": 0.14960864260792733, "rewards/custom_reward_logic_v4_batch_streak": 0.12062499690800906, "step": 390 }, { "completion_length": 31.74375, "epoch": 0.0031857025668798433, "grad_norm": 0.959058403968811, "kl": 0.33982390016317365, "learning_rate": 4.93261217644956e-06, "loss": 0.0136, "reward": 0.0769291702657938, "reward_std": 0.21129344925284385, "rewards/custom_reward_logic_v4_batch_streak": 0.0769291702657938, "step": 400 }, { "completion_length": 18.8875, "epoch": 0.0032653451310518393, "grad_norm": 0.06978488713502884, "kl": 0.3929797440767288, "learning_rate": 4.9239697496952904e-06, "loss": 0.0157, "reward": 0.17000000439584256, "reward_std": 0.222589847445488, "rewards/custom_reward_logic_v4_batch_streak": 0.17000000439584256, "step": 410 }, { "completion_length": 20.36875, "epoch": 0.0033449876952238353, "grad_norm": 0.08526802808046341, "kl": 0.3567995116114616, "learning_rate": 4.914814565722671e-06, "loss": 0.0143, "reward": 0.28562499955296516, "reward_std": 0.14162895157933236, "rewards/custom_reward_logic_v4_batch_streak": 0.28562499955296516, "step": 420 }, { "completion_length": 19.39375, "epoch": 0.0034246302593958313, "grad_norm": 0.124000184237957, "kl": 0.36893701553344727, "learning_rate": 4.905148561184033e-06, "loss": 0.0148, "reward": 0.10037499703466893, "reward_std": 0.09048115760087967, "rewards/custom_reward_logic_v4_batch_streak": 0.10037499703466893, "step": 430 }, { "completion_length": 20.575, "epoch": 0.0035042728235678278, "grad_norm": 0.5444723963737488, "kl": 0.34243927001953123, "learning_rate": 4.894973780788722e-06, "loss": 0.0137, "reward": 0.17549999970942737, "reward_std": 0.09591511413455009, "rewards/custom_reward_logic_v4_batch_streak": 0.17549999970942737, "step": 440 }, { "completion_length": 30.30625, "epoch": 0.0035839153877398238, "grad_norm": 0.46829524636268616, "kl": 0.31388519033789636, "learning_rate": 4.884292376870567e-06, "loss": 0.0126, "reward": 0.10238021239638329, "reward_std": 0.17024155631661414, "rewards/custom_reward_logic_v4_batch_streak": 0.10238021239638329, "step": 450 }, { "completion_length": 39.55, "epoch": 0.0036635579519118198, "grad_norm": 0.2565569579601288, "kl": 0.33122892007231713, "learning_rate": 4.873106608932585e-06, "loss": 0.0132, "reward": 0.1605322863906622, "reward_std": 0.3534850224852562, "rewards/custom_reward_logic_v4_batch_streak": 0.1605322863906622, "step": 460 }, { "completion_length": 20.33125, "epoch": 0.003743200516083816, "grad_norm": 0.9290266633033752, "kl": 0.3362825021147728, "learning_rate": 4.861418843169012e-06, "loss": 0.0135, "reward": 0.06937500275671482, "reward_std": 0.1000722162425518, "rewards/custom_reward_logic_v4_batch_streak": 0.06937500275671482, "step": 470 } ], "logging_steps": 10, "max_steps": 2400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }