| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.003783021798169814, |
| "eval_steps": 500, |
| "global_step": 475, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 760.69375, |
| "epoch": 7.964256417199609e-05, |
| "grad_norm": 0.1797904223203659, |
| "kl": 0.0005932799191214144, |
| "learning_rate": 2.0833333333333333e-07, |
| "loss": 0.0, |
| "reward": -2.440429142862558, |
| "reward_std": 1.2779221206903457, |
| "rewards/custom_reward_logic_v4_batch_streak": -2.440429142862558, |
| "step": 10 |
| }, |
| { |
| "completion_length": 872.7625, |
| "epoch": 0.00015928512834399218, |
| "grad_norm": 0.17659229040145874, |
| "kl": 0.0007143634895328433, |
| "learning_rate": 4.1666666666666667e-07, |
| "loss": 0.0, |
| "reward": -3.123613569140434, |
| "reward_std": 1.3400578200817108, |
| "rewards/custom_reward_logic_v4_batch_streak": -3.123613569140434, |
| "step": 20 |
| }, |
| { |
| "completion_length": 810.54375, |
| "epoch": 0.00023892769251598824, |
| "grad_norm": 0.15338778495788574, |
| "kl": 0.0007711865269811824, |
| "learning_rate": 6.25e-07, |
| "loss": 0.0, |
| "reward": -2.5357208490371703, |
| "reward_std": 1.530259145796299, |
| "rewards/custom_reward_logic_v4_batch_streak": -2.5357208490371703, |
| "step": 30 |
| }, |
| { |
| "completion_length": 916.375, |
| "epoch": 0.00031857025668798435, |
| "grad_norm": 0.18688252568244934, |
| "kl": 0.0007225456734886393, |
| "learning_rate": 8.333333333333333e-07, |
| "loss": 0.0, |
| "reward": -2.8683072827756404, |
| "reward_std": 1.2736640498042107, |
| "rewards/custom_reward_logic_v4_batch_streak": -2.8683072827756404, |
| "step": 40 |
| }, |
| { |
| "completion_length": 807.425, |
| "epoch": 0.0003982128208599804, |
| "grad_norm": 0.13943322002887726, |
| "kl": 0.0007514923432609067, |
| "learning_rate": 1.0416666666666667e-06, |
| "loss": 0.0, |
| "reward": -2.7300030887126923, |
| "reward_std": 1.1818634122610092, |
| "rewards/custom_reward_logic_v4_batch_streak": -2.7300030887126923, |
| "step": 50 |
| }, |
| { |
| "completion_length": 877.45625, |
| "epoch": 0.0004778553850319765, |
| "grad_norm": 0.1495009958744049, |
| "kl": 0.0007267521868925542, |
| "learning_rate": 1.25e-06, |
| "loss": 0.0, |
| "reward": -3.006270831823349, |
| "reward_std": 1.3253380209207535, |
| "rewards/custom_reward_logic_v4_batch_streak": -3.006270831823349, |
| "step": 60 |
| }, |
| { |
| "completion_length": 855.1875, |
| "epoch": 0.0005574979492039726, |
| "grad_norm": 0.17970818281173706, |
| "kl": 0.0007509023474995047, |
| "learning_rate": 1.4583333333333335e-06, |
| "loss": 0.0, |
| "reward": -3.1822583481669424, |
| "reward_std": 1.442990928888321, |
| "rewards/custom_reward_logic_v4_batch_streak": -3.1822583481669424, |
| "step": 70 |
| }, |
| { |
| "completion_length": 900.675, |
| "epoch": 0.0006371405133759687, |
| "grad_norm": 0.17815305292606354, |
| "kl": 0.0007381761737633496, |
| "learning_rate": 1.6666666666666667e-06, |
| "loss": 0.0, |
| "reward": -2.6316666841506957, |
| "reward_std": 1.4472795471549034, |
| "rewards/custom_reward_logic_v4_batch_streak": -2.6316666841506957, |
| "step": 80 |
| }, |
| { |
| "completion_length": 813.15625, |
| "epoch": 0.0007167830775479647, |
| "grad_norm": 0.14456522464752197, |
| "kl": 0.0007069750688970089, |
| "learning_rate": 1.8750000000000003e-06, |
| "loss": 0.0, |
| "reward": -2.426281274855137, |
| "reward_std": 1.375066339969635, |
| "rewards/custom_reward_logic_v4_batch_streak": -2.426281274855137, |
| "step": 90 |
| }, |
| { |
| "completion_length": 878.14375, |
| "epoch": 0.0007964256417199608, |
| "grad_norm": 0.16884203255176544, |
| "kl": 0.0007481900160200894, |
| "learning_rate": 2.0833333333333334e-06, |
| "loss": 0.0, |
| "reward": -3.0540165841579436, |
| "reward_std": 1.2169389009475708, |
| "rewards/custom_reward_logic_v4_batch_streak": -3.0540165841579436, |
| "step": 100 |
| }, |
| { |
| "completion_length": 868.33125, |
| "epoch": 0.0008760682058919569, |
| "grad_norm": 0.16743966937065125, |
| "kl": 0.0007829183887224644, |
| "learning_rate": 2.2916666666666666e-06, |
| "loss": 0.0, |
| "reward": -3.056391695141792, |
| "reward_std": 1.5562937021255494, |
| "rewards/custom_reward_logic_v4_batch_streak": -3.056391695141792, |
| "step": 110 |
| }, |
| { |
| "completion_length": 720.83125, |
| "epoch": 0.000955710770063953, |
| "grad_norm": 0.24302148818969727, |
| "kl": 0.000822445034282282, |
| "learning_rate": 2.5e-06, |
| "loss": 0.0, |
| "reward": -1.9373770911246537, |
| "reward_std": 1.258049274981022, |
| "rewards/custom_reward_logic_v4_batch_streak": -1.9373770911246537, |
| "step": 120 |
| }, |
| { |
| "completion_length": 978.64375, |
| "epoch": 0.001035353334235949, |
| "grad_norm": 0.17354010045528412, |
| "kl": 0.000797444346244447, |
| "learning_rate": 2.7083333333333334e-06, |
| "loss": 0.0, |
| "reward": -3.6928374975919724, |
| "reward_std": 1.3044769436120986, |
| "rewards/custom_reward_logic_v4_batch_streak": -3.6928374975919724, |
| "step": 130 |
| }, |
| { |
| "completion_length": 829.23125, |
| "epoch": 0.0011149958984079452, |
| "grad_norm": 0.223940908908844, |
| "kl": 0.0009410725091584027, |
| "learning_rate": 2.916666666666667e-06, |
| "loss": 0.0, |
| "reward": -2.7124270781874658, |
| "reward_std": 1.4558519303798676, |
| "rewards/custom_reward_logic_v4_batch_streak": -2.7124270781874658, |
| "step": 140 |
| }, |
| { |
| "completion_length": 837.85625, |
| "epoch": 0.0011946384625799412, |
| "grad_norm": 0.1868145614862442, |
| "kl": 0.0009413436113391071, |
| "learning_rate": 3.125e-06, |
| "loss": 0.0, |
| "reward": -2.4677958875894546, |
| "reward_std": 1.2407755464315415, |
| "rewards/custom_reward_logic_v4_batch_streak": -2.4677958875894546, |
| "step": 150 |
| }, |
| { |
| "completion_length": 743.45625, |
| "epoch": 0.0012742810267519374, |
| "grad_norm": 0.1819629669189453, |
| "kl": 0.0011384535144316032, |
| "learning_rate": 3.3333333333333333e-06, |
| "loss": 0.0, |
| "reward": -2.255056257545948, |
| "reward_std": 1.2829032361507415, |
| "rewards/custom_reward_logic_v4_batch_streak": -2.255056257545948, |
| "step": 160 |
| }, |
| { |
| "completion_length": 631.0125, |
| "epoch": 0.0013539235909239334, |
| "grad_norm": 0.22249050438404083, |
| "kl": 0.001341403860715218, |
| "learning_rate": 3.5416666666666673e-06, |
| "loss": 0.0001, |
| "reward": -1.380665649473667, |
| "reward_std": 1.301654589176178, |
| "rewards/custom_reward_logic_v4_batch_streak": -1.380665649473667, |
| "step": 170 |
| }, |
| { |
| "completion_length": 676.075, |
| "epoch": 0.0014335661550959294, |
| "grad_norm": 0.23651157319545746, |
| "kl": 0.001932383590610698, |
| "learning_rate": 3.7500000000000005e-06, |
| "loss": 0.0001, |
| "reward": -2.217651057243347, |
| "reward_std": 1.1832541689276694, |
| "rewards/custom_reward_logic_v4_batch_streak": -2.217651057243347, |
| "step": 180 |
| }, |
| { |
| "completion_length": 716.3125, |
| "epoch": 0.0015132087192679256, |
| "grad_norm": 0.930211067199707, |
| "kl": 0.0031241982942447066, |
| "learning_rate": 3.958333333333333e-06, |
| "loss": 0.0001, |
| "reward": -2.1449333682656286, |
| "reward_std": 1.272219567000866, |
| "rewards/custom_reward_logic_v4_batch_streak": -2.1449333682656286, |
| "step": 190 |
| }, |
| { |
| "completion_length": 723.33125, |
| "epoch": 0.0015928512834399217, |
| "grad_norm": 0.23318283259868622, |
| "kl": 0.0023288113647140563, |
| "learning_rate": 4.166666666666667e-06, |
| "loss": 0.0001, |
| "reward": -2.1110031098127364, |
| "reward_std": 1.1132041677832603, |
| "rewards/custom_reward_logic_v4_batch_streak": -2.1110031098127364, |
| "step": 200 |
| }, |
| { |
| "completion_length": 706.54375, |
| "epoch": 0.0016724938476119177, |
| "grad_norm": 0.26098617911338806, |
| "kl": 0.003358338767429814, |
| "learning_rate": 4.3750000000000005e-06, |
| "loss": 0.0001, |
| "reward": -2.3224770683795213, |
| "reward_std": 1.2092776507139207, |
| "rewards/custom_reward_logic_v4_batch_streak": -2.3224770683795213, |
| "step": 210 |
| }, |
| { |
| "completion_length": 811.08125, |
| "epoch": 0.0017521364117839139, |
| "grad_norm": 1.0346773862838745, |
| "kl": 0.005868167115841061, |
| "learning_rate": 4.583333333333333e-06, |
| "loss": 0.0002, |
| "reward": -2.6366010539233686, |
| "reward_std": 1.3989030092954635, |
| "rewards/custom_reward_logic_v4_batch_streak": -2.6366010539233686, |
| "step": 220 |
| }, |
| { |
| "completion_length": 211.44375, |
| "epoch": 0.0018317789759559099, |
| "grad_norm": 0.8315042853355408, |
| "kl": 0.20073928231140598, |
| "learning_rate": 4.791666666666668e-06, |
| "loss": 0.008, |
| "reward": -0.5939687395468354, |
| "reward_std": 0.7772796258330346, |
| "rewards/custom_reward_logic_v4_batch_streak": -0.5939687395468354, |
| "step": 230 |
| }, |
| { |
| "completion_length": 27.75625, |
| "epoch": 0.001911421540127906, |
| "grad_norm": 0.7632138133049011, |
| "kl": 0.3509638696908951, |
| "learning_rate": 5e-06, |
| "loss": 0.014, |
| "reward": 0.14633646439760922, |
| "reward_std": 0.2748184122145176, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.14633646439760922, |
| "step": 240 |
| }, |
| { |
| "completion_length": 28.425, |
| "epoch": 0.001991064104299902, |
| "grad_norm": 0.5807326436042786, |
| "kl": 0.3273721463978291, |
| "learning_rate": 4.999735579817769e-06, |
| "loss": 0.0131, |
| "reward": 0.11352083273231983, |
| "reward_std": 0.2841566324234009, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.11352083273231983, |
| "step": 250 |
| }, |
| { |
| "completion_length": 19.95625, |
| "epoch": 0.002070706668471898, |
| "grad_norm": 0.03958822786808014, |
| "kl": 0.33097796961665155, |
| "learning_rate": 4.998942375205502e-06, |
| "loss": 0.0132, |
| "reward": 0.10125000216066837, |
| "reward_std": 0.1778048150241375, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.10125000216066837, |
| "step": 260 |
| }, |
| { |
| "completion_length": 25.31875, |
| "epoch": 0.0021503492326438944, |
| "grad_norm": 1.1055091619491577, |
| "kl": 0.34529968798160554, |
| "learning_rate": 4.997620553954645e-06, |
| "loss": 0.0138, |
| "reward": 0.15995520818978548, |
| "reward_std": 0.2416255235671997, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.15995520818978548, |
| "step": 270 |
| }, |
| { |
| "completion_length": 19.18125, |
| "epoch": 0.0022299917968158904, |
| "grad_norm": 0.799738883972168, |
| "kl": 0.3341792456805706, |
| "learning_rate": 4.995770395678171e-06, |
| "loss": 0.0134, |
| "reward": 0.2549999985843897, |
| "reward_std": 0.1341205656528473, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.2549999985843897, |
| "step": 280 |
| }, |
| { |
| "completion_length": 27.70625, |
| "epoch": 0.0023096343609878864, |
| "grad_norm": 0.02682190202176571, |
| "kl": 0.3167652033269405, |
| "learning_rate": 4.993392291751431e-06, |
| "loss": 0.0127, |
| "reward": 0.17528020832687616, |
| "reward_std": 0.17236635982990264, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.17528020832687616, |
| "step": 290 |
| }, |
| { |
| "completion_length": 25.6875, |
| "epoch": 0.0023892769251598824, |
| "grad_norm": 0.6407962441444397, |
| "kl": 0.32680063620209693, |
| "learning_rate": 4.990486745229364e-06, |
| "loss": 0.0131, |
| "reward": 0.16030624657869338, |
| "reward_std": 0.2103947691619396, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.16030624657869338, |
| "step": 300 |
| }, |
| { |
| "completion_length": 35.26875, |
| "epoch": 0.0024689194893318784, |
| "grad_norm": 0.8467773795127869, |
| "kl": 0.3288160003721714, |
| "learning_rate": 4.9870543707400835e-06, |
| "loss": 0.0132, |
| "reward": 0.14182916339486837, |
| "reward_std": 0.23385633081197738, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.14182916339486837, |
| "step": 310 |
| }, |
| { |
| "completion_length": 21.6, |
| "epoch": 0.002548562053503875, |
| "grad_norm": 1.2785643339157104, |
| "kl": 0.32860224805772303, |
| "learning_rate": 4.983095894354858e-06, |
| "loss": 0.0131, |
| "reward": 0.29108228590339424, |
| "reward_std": 0.22271099761128427, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.29108228590339424, |
| "step": 320 |
| }, |
| { |
| "completion_length": 19.51875, |
| "epoch": 0.002628204617675871, |
| "grad_norm": 0.04232069477438927, |
| "kl": 0.33390086218714715, |
| "learning_rate": 4.978612153434527e-06, |
| "loss": 0.0134, |
| "reward": 0.44767499435693026, |
| "reward_std": 0.18191057518124581, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.44767499435693026, |
| "step": 330 |
| }, |
| { |
| "completion_length": 18.40625, |
| "epoch": 0.002707847181847867, |
| "grad_norm": 0.9771687984466553, |
| "kl": 0.392937633395195, |
| "learning_rate": 4.973604096452361e-06, |
| "loss": 0.0157, |
| "reward": 0.2312499986961484, |
| "reward_std": 0.14022469893097878, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.2312499986961484, |
| "step": 340 |
| }, |
| { |
| "completion_length": 19.75625, |
| "epoch": 0.002787489746019863, |
| "grad_norm": 0.1014222577214241, |
| "kl": 0.33149235770106317, |
| "learning_rate": 4.968072782793436e-06, |
| "loss": 0.0133, |
| "reward": 0.2172499977052212, |
| "reward_std": 0.10931163281202316, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.2172499977052212, |
| "step": 350 |
| }, |
| { |
| "completion_length": 18.9375, |
| "epoch": 0.002867132310191859, |
| "grad_norm": 0.5375373363494873, |
| "kl": 0.33093191757798196, |
| "learning_rate": 4.962019382530521e-06, |
| "loss": 0.0132, |
| "reward": 0.3819999981671572, |
| "reward_std": 0.1661699414253235, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.3819999981671572, |
| "step": 360 |
| }, |
| { |
| "completion_length": 18.0, |
| "epoch": 0.002946774874363855, |
| "grad_norm": 1.0857776403427124, |
| "kl": 0.3886502429842949, |
| "learning_rate": 4.955445176176577e-06, |
| "loss": 0.0155, |
| "reward": 0.17512499764561654, |
| "reward_std": 0.16201305240392685, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.17512499764561654, |
| "step": 370 |
| }, |
| { |
| "completion_length": 28.75, |
| "epoch": 0.0030264174385358513, |
| "grad_norm": 1.218420147895813, |
| "kl": 0.3669083297252655, |
| "learning_rate": 4.948351554413879e-06, |
| "loss": 0.0147, |
| "reward": 0.2639281203970313, |
| "reward_std": 0.22483034804463387, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.2639281203970313, |
| "step": 380 |
| }, |
| { |
| "completion_length": 19.06875, |
| "epoch": 0.0031060600027078473, |
| "grad_norm": 0.09816683083772659, |
| "kl": 0.36456960439682007, |
| "learning_rate": 4.9407400177998335e-06, |
| "loss": 0.0146, |
| "reward": 0.12062499690800906, |
| "reward_std": 0.14960864260792733, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.12062499690800906, |
| "step": 390 |
| }, |
| { |
| "completion_length": 31.74375, |
| "epoch": 0.0031857025668798433, |
| "grad_norm": 0.959058403968811, |
| "kl": 0.33982390016317365, |
| "learning_rate": 4.93261217644956e-06, |
| "loss": 0.0136, |
| "reward": 0.0769291702657938, |
| "reward_std": 0.21129344925284385, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.0769291702657938, |
| "step": 400 |
| }, |
| { |
| "completion_length": 18.8875, |
| "epoch": 0.0032653451310518393, |
| "grad_norm": 0.06978488713502884, |
| "kl": 0.3929797440767288, |
| "learning_rate": 4.9239697496952904e-06, |
| "loss": 0.0157, |
| "reward": 0.17000000439584256, |
| "reward_std": 0.222589847445488, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.17000000439584256, |
| "step": 410 |
| }, |
| { |
| "completion_length": 20.36875, |
| "epoch": 0.0033449876952238353, |
| "grad_norm": 0.08526802808046341, |
| "kl": 0.3567995116114616, |
| "learning_rate": 4.914814565722671e-06, |
| "loss": 0.0143, |
| "reward": 0.28562499955296516, |
| "reward_std": 0.14162895157933236, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.28562499955296516, |
| "step": 420 |
| }, |
| { |
| "completion_length": 19.39375, |
| "epoch": 0.0034246302593958313, |
| "grad_norm": 0.124000184237957, |
| "kl": 0.36893701553344727, |
| "learning_rate": 4.905148561184033e-06, |
| "loss": 0.0148, |
| "reward": 0.10037499703466893, |
| "reward_std": 0.09048115760087967, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.10037499703466893, |
| "step": 430 |
| }, |
| { |
| "completion_length": 20.575, |
| "epoch": 0.0035042728235678278, |
| "grad_norm": 0.5444723963737488, |
| "kl": 0.34243927001953123, |
| "learning_rate": 4.894973780788722e-06, |
| "loss": 0.0137, |
| "reward": 0.17549999970942737, |
| "reward_std": 0.09591511413455009, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.17549999970942737, |
| "step": 440 |
| }, |
| { |
| "completion_length": 30.30625, |
| "epoch": 0.0035839153877398238, |
| "grad_norm": 0.46829524636268616, |
| "kl": 0.31388519033789636, |
| "learning_rate": 4.884292376870567e-06, |
| "loss": 0.0126, |
| "reward": 0.10238021239638329, |
| "reward_std": 0.17024155631661414, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.10238021239638329, |
| "step": 450 |
| }, |
| { |
| "completion_length": 39.55, |
| "epoch": 0.0036635579519118198, |
| "grad_norm": 0.2565569579601288, |
| "kl": 0.33122892007231713, |
| "learning_rate": 4.873106608932585e-06, |
| "loss": 0.0132, |
| "reward": 0.1605322863906622, |
| "reward_std": 0.3534850224852562, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.1605322863906622, |
| "step": 460 |
| }, |
| { |
| "completion_length": 20.33125, |
| "epoch": 0.003743200516083816, |
| "grad_norm": 0.9290266633033752, |
| "kl": 0.3362825021147728, |
| "learning_rate": 4.861418843169012e-06, |
| "loss": 0.0135, |
| "reward": 0.06937500275671482, |
| "reward_std": 0.1000722162425518, |
| "rewards/custom_reward_logic_v4_batch_streak": 0.06937500275671482, |
| "step": 470 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 2400, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 25, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|