smirki's picture
Training in progress, step 475, checkpoint
cac1e00 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.003783021798169814,
"eval_steps": 500,
"global_step": 475,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 760.69375,
"epoch": 7.964256417199609e-05,
"grad_norm": 0.1797904223203659,
"kl": 0.0005932799191214144,
"learning_rate": 2.0833333333333333e-07,
"loss": 0.0,
"reward": -2.440429142862558,
"reward_std": 1.2779221206903457,
"rewards/custom_reward_logic_v4_batch_streak": -2.440429142862558,
"step": 10
},
{
"completion_length": 872.7625,
"epoch": 0.00015928512834399218,
"grad_norm": 0.17659229040145874,
"kl": 0.0007143634895328433,
"learning_rate": 4.1666666666666667e-07,
"loss": 0.0,
"reward": -3.123613569140434,
"reward_std": 1.3400578200817108,
"rewards/custom_reward_logic_v4_batch_streak": -3.123613569140434,
"step": 20
},
{
"completion_length": 810.54375,
"epoch": 0.00023892769251598824,
"grad_norm": 0.15338778495788574,
"kl": 0.0007711865269811824,
"learning_rate": 6.25e-07,
"loss": 0.0,
"reward": -2.5357208490371703,
"reward_std": 1.530259145796299,
"rewards/custom_reward_logic_v4_batch_streak": -2.5357208490371703,
"step": 30
},
{
"completion_length": 916.375,
"epoch": 0.00031857025668798435,
"grad_norm": 0.18688252568244934,
"kl": 0.0007225456734886393,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0,
"reward": -2.8683072827756404,
"reward_std": 1.2736640498042107,
"rewards/custom_reward_logic_v4_batch_streak": -2.8683072827756404,
"step": 40
},
{
"completion_length": 807.425,
"epoch": 0.0003982128208599804,
"grad_norm": 0.13943322002887726,
"kl": 0.0007514923432609067,
"learning_rate": 1.0416666666666667e-06,
"loss": 0.0,
"reward": -2.7300030887126923,
"reward_std": 1.1818634122610092,
"rewards/custom_reward_logic_v4_batch_streak": -2.7300030887126923,
"step": 50
},
{
"completion_length": 877.45625,
"epoch": 0.0004778553850319765,
"grad_norm": 0.1495009958744049,
"kl": 0.0007267521868925542,
"learning_rate": 1.25e-06,
"loss": 0.0,
"reward": -3.006270831823349,
"reward_std": 1.3253380209207535,
"rewards/custom_reward_logic_v4_batch_streak": -3.006270831823349,
"step": 60
},
{
"completion_length": 855.1875,
"epoch": 0.0005574979492039726,
"grad_norm": 0.17970818281173706,
"kl": 0.0007509023474995047,
"learning_rate": 1.4583333333333335e-06,
"loss": 0.0,
"reward": -3.1822583481669424,
"reward_std": 1.442990928888321,
"rewards/custom_reward_logic_v4_batch_streak": -3.1822583481669424,
"step": 70
},
{
"completion_length": 900.675,
"epoch": 0.0006371405133759687,
"grad_norm": 0.17815305292606354,
"kl": 0.0007381761737633496,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0,
"reward": -2.6316666841506957,
"reward_std": 1.4472795471549034,
"rewards/custom_reward_logic_v4_batch_streak": -2.6316666841506957,
"step": 80
},
{
"completion_length": 813.15625,
"epoch": 0.0007167830775479647,
"grad_norm": 0.14456522464752197,
"kl": 0.0007069750688970089,
"learning_rate": 1.8750000000000003e-06,
"loss": 0.0,
"reward": -2.426281274855137,
"reward_std": 1.375066339969635,
"rewards/custom_reward_logic_v4_batch_streak": -2.426281274855137,
"step": 90
},
{
"completion_length": 878.14375,
"epoch": 0.0007964256417199608,
"grad_norm": 0.16884203255176544,
"kl": 0.0007481900160200894,
"learning_rate": 2.0833333333333334e-06,
"loss": 0.0,
"reward": -3.0540165841579436,
"reward_std": 1.2169389009475708,
"rewards/custom_reward_logic_v4_batch_streak": -3.0540165841579436,
"step": 100
},
{
"completion_length": 868.33125,
"epoch": 0.0008760682058919569,
"grad_norm": 0.16743966937065125,
"kl": 0.0007829183887224644,
"learning_rate": 2.2916666666666666e-06,
"loss": 0.0,
"reward": -3.056391695141792,
"reward_std": 1.5562937021255494,
"rewards/custom_reward_logic_v4_batch_streak": -3.056391695141792,
"step": 110
},
{
"completion_length": 720.83125,
"epoch": 0.000955710770063953,
"grad_norm": 0.24302148818969727,
"kl": 0.000822445034282282,
"learning_rate": 2.5e-06,
"loss": 0.0,
"reward": -1.9373770911246537,
"reward_std": 1.258049274981022,
"rewards/custom_reward_logic_v4_batch_streak": -1.9373770911246537,
"step": 120
},
{
"completion_length": 978.64375,
"epoch": 0.001035353334235949,
"grad_norm": 0.17354010045528412,
"kl": 0.000797444346244447,
"learning_rate": 2.7083333333333334e-06,
"loss": 0.0,
"reward": -3.6928374975919724,
"reward_std": 1.3044769436120986,
"rewards/custom_reward_logic_v4_batch_streak": -3.6928374975919724,
"step": 130
},
{
"completion_length": 829.23125,
"epoch": 0.0011149958984079452,
"grad_norm": 0.223940908908844,
"kl": 0.0009410725091584027,
"learning_rate": 2.916666666666667e-06,
"loss": 0.0,
"reward": -2.7124270781874658,
"reward_std": 1.4558519303798676,
"rewards/custom_reward_logic_v4_batch_streak": -2.7124270781874658,
"step": 140
},
{
"completion_length": 837.85625,
"epoch": 0.0011946384625799412,
"grad_norm": 0.1868145614862442,
"kl": 0.0009413436113391071,
"learning_rate": 3.125e-06,
"loss": 0.0,
"reward": -2.4677958875894546,
"reward_std": 1.2407755464315415,
"rewards/custom_reward_logic_v4_batch_streak": -2.4677958875894546,
"step": 150
},
{
"completion_length": 743.45625,
"epoch": 0.0012742810267519374,
"grad_norm": 0.1819629669189453,
"kl": 0.0011384535144316032,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0,
"reward": -2.255056257545948,
"reward_std": 1.2829032361507415,
"rewards/custom_reward_logic_v4_batch_streak": -2.255056257545948,
"step": 160
},
{
"completion_length": 631.0125,
"epoch": 0.0013539235909239334,
"grad_norm": 0.22249050438404083,
"kl": 0.001341403860715218,
"learning_rate": 3.5416666666666673e-06,
"loss": 0.0001,
"reward": -1.380665649473667,
"reward_std": 1.301654589176178,
"rewards/custom_reward_logic_v4_batch_streak": -1.380665649473667,
"step": 170
},
{
"completion_length": 676.075,
"epoch": 0.0014335661550959294,
"grad_norm": 0.23651157319545746,
"kl": 0.001932383590610698,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0001,
"reward": -2.217651057243347,
"reward_std": 1.1832541689276694,
"rewards/custom_reward_logic_v4_batch_streak": -2.217651057243347,
"step": 180
},
{
"completion_length": 716.3125,
"epoch": 0.0015132087192679256,
"grad_norm": 0.930211067199707,
"kl": 0.0031241982942447066,
"learning_rate": 3.958333333333333e-06,
"loss": 0.0001,
"reward": -2.1449333682656286,
"reward_std": 1.272219567000866,
"rewards/custom_reward_logic_v4_batch_streak": -2.1449333682656286,
"step": 190
},
{
"completion_length": 723.33125,
"epoch": 0.0015928512834399217,
"grad_norm": 0.23318283259868622,
"kl": 0.0023288113647140563,
"learning_rate": 4.166666666666667e-06,
"loss": 0.0001,
"reward": -2.1110031098127364,
"reward_std": 1.1132041677832603,
"rewards/custom_reward_logic_v4_batch_streak": -2.1110031098127364,
"step": 200
},
{
"completion_length": 706.54375,
"epoch": 0.0016724938476119177,
"grad_norm": 0.26098617911338806,
"kl": 0.003358338767429814,
"learning_rate": 4.3750000000000005e-06,
"loss": 0.0001,
"reward": -2.3224770683795213,
"reward_std": 1.2092776507139207,
"rewards/custom_reward_logic_v4_batch_streak": -2.3224770683795213,
"step": 210
},
{
"completion_length": 811.08125,
"epoch": 0.0017521364117839139,
"grad_norm": 1.0346773862838745,
"kl": 0.005868167115841061,
"learning_rate": 4.583333333333333e-06,
"loss": 0.0002,
"reward": -2.6366010539233686,
"reward_std": 1.3989030092954635,
"rewards/custom_reward_logic_v4_batch_streak": -2.6366010539233686,
"step": 220
},
{
"completion_length": 211.44375,
"epoch": 0.0018317789759559099,
"grad_norm": 0.8315042853355408,
"kl": 0.20073928231140598,
"learning_rate": 4.791666666666668e-06,
"loss": 0.008,
"reward": -0.5939687395468354,
"reward_std": 0.7772796258330346,
"rewards/custom_reward_logic_v4_batch_streak": -0.5939687395468354,
"step": 230
},
{
"completion_length": 27.75625,
"epoch": 0.001911421540127906,
"grad_norm": 0.7632138133049011,
"kl": 0.3509638696908951,
"learning_rate": 5e-06,
"loss": 0.014,
"reward": 0.14633646439760922,
"reward_std": 0.2748184122145176,
"rewards/custom_reward_logic_v4_batch_streak": 0.14633646439760922,
"step": 240
},
{
"completion_length": 28.425,
"epoch": 0.001991064104299902,
"grad_norm": 0.5807326436042786,
"kl": 0.3273721463978291,
"learning_rate": 4.999735579817769e-06,
"loss": 0.0131,
"reward": 0.11352083273231983,
"reward_std": 0.2841566324234009,
"rewards/custom_reward_logic_v4_batch_streak": 0.11352083273231983,
"step": 250
},
{
"completion_length": 19.95625,
"epoch": 0.002070706668471898,
"grad_norm": 0.03958822786808014,
"kl": 0.33097796961665155,
"learning_rate": 4.998942375205502e-06,
"loss": 0.0132,
"reward": 0.10125000216066837,
"reward_std": 0.1778048150241375,
"rewards/custom_reward_logic_v4_batch_streak": 0.10125000216066837,
"step": 260
},
{
"completion_length": 25.31875,
"epoch": 0.0021503492326438944,
"grad_norm": 1.1055091619491577,
"kl": 0.34529968798160554,
"learning_rate": 4.997620553954645e-06,
"loss": 0.0138,
"reward": 0.15995520818978548,
"reward_std": 0.2416255235671997,
"rewards/custom_reward_logic_v4_batch_streak": 0.15995520818978548,
"step": 270
},
{
"completion_length": 19.18125,
"epoch": 0.0022299917968158904,
"grad_norm": 0.799738883972168,
"kl": 0.3341792456805706,
"learning_rate": 4.995770395678171e-06,
"loss": 0.0134,
"reward": 0.2549999985843897,
"reward_std": 0.1341205656528473,
"rewards/custom_reward_logic_v4_batch_streak": 0.2549999985843897,
"step": 280
},
{
"completion_length": 27.70625,
"epoch": 0.0023096343609878864,
"grad_norm": 0.02682190202176571,
"kl": 0.3167652033269405,
"learning_rate": 4.993392291751431e-06,
"loss": 0.0127,
"reward": 0.17528020832687616,
"reward_std": 0.17236635982990264,
"rewards/custom_reward_logic_v4_batch_streak": 0.17528020832687616,
"step": 290
},
{
"completion_length": 25.6875,
"epoch": 0.0023892769251598824,
"grad_norm": 0.6407962441444397,
"kl": 0.32680063620209693,
"learning_rate": 4.990486745229364e-06,
"loss": 0.0131,
"reward": 0.16030624657869338,
"reward_std": 0.2103947691619396,
"rewards/custom_reward_logic_v4_batch_streak": 0.16030624657869338,
"step": 300
},
{
"completion_length": 35.26875,
"epoch": 0.0024689194893318784,
"grad_norm": 0.8467773795127869,
"kl": 0.3288160003721714,
"learning_rate": 4.9870543707400835e-06,
"loss": 0.0132,
"reward": 0.14182916339486837,
"reward_std": 0.23385633081197738,
"rewards/custom_reward_logic_v4_batch_streak": 0.14182916339486837,
"step": 310
},
{
"completion_length": 21.6,
"epoch": 0.002548562053503875,
"grad_norm": 1.2785643339157104,
"kl": 0.32860224805772303,
"learning_rate": 4.983095894354858e-06,
"loss": 0.0131,
"reward": 0.29108228590339424,
"reward_std": 0.22271099761128427,
"rewards/custom_reward_logic_v4_batch_streak": 0.29108228590339424,
"step": 320
},
{
"completion_length": 19.51875,
"epoch": 0.002628204617675871,
"grad_norm": 0.04232069477438927,
"kl": 0.33390086218714715,
"learning_rate": 4.978612153434527e-06,
"loss": 0.0134,
"reward": 0.44767499435693026,
"reward_std": 0.18191057518124581,
"rewards/custom_reward_logic_v4_batch_streak": 0.44767499435693026,
"step": 330
},
{
"completion_length": 18.40625,
"epoch": 0.002707847181847867,
"grad_norm": 0.9771687984466553,
"kl": 0.392937633395195,
"learning_rate": 4.973604096452361e-06,
"loss": 0.0157,
"reward": 0.2312499986961484,
"reward_std": 0.14022469893097878,
"rewards/custom_reward_logic_v4_batch_streak": 0.2312499986961484,
"step": 340
},
{
"completion_length": 19.75625,
"epoch": 0.002787489746019863,
"grad_norm": 0.1014222577214241,
"kl": 0.33149235770106317,
"learning_rate": 4.968072782793436e-06,
"loss": 0.0133,
"reward": 0.2172499977052212,
"reward_std": 0.10931163281202316,
"rewards/custom_reward_logic_v4_batch_streak": 0.2172499977052212,
"step": 350
},
{
"completion_length": 18.9375,
"epoch": 0.002867132310191859,
"grad_norm": 0.5375373363494873,
"kl": 0.33093191757798196,
"learning_rate": 4.962019382530521e-06,
"loss": 0.0132,
"reward": 0.3819999981671572,
"reward_std": 0.1661699414253235,
"rewards/custom_reward_logic_v4_batch_streak": 0.3819999981671572,
"step": 360
},
{
"completion_length": 18.0,
"epoch": 0.002946774874363855,
"grad_norm": 1.0857776403427124,
"kl": 0.3886502429842949,
"learning_rate": 4.955445176176577e-06,
"loss": 0.0155,
"reward": 0.17512499764561654,
"reward_std": 0.16201305240392685,
"rewards/custom_reward_logic_v4_batch_streak": 0.17512499764561654,
"step": 370
},
{
"completion_length": 28.75,
"epoch": 0.0030264174385358513,
"grad_norm": 1.218420147895813,
"kl": 0.3669083297252655,
"learning_rate": 4.948351554413879e-06,
"loss": 0.0147,
"reward": 0.2639281203970313,
"reward_std": 0.22483034804463387,
"rewards/custom_reward_logic_v4_batch_streak": 0.2639281203970313,
"step": 380
},
{
"completion_length": 19.06875,
"epoch": 0.0031060600027078473,
"grad_norm": 0.09816683083772659,
"kl": 0.36456960439682007,
"learning_rate": 4.9407400177998335e-06,
"loss": 0.0146,
"reward": 0.12062499690800906,
"reward_std": 0.14960864260792733,
"rewards/custom_reward_logic_v4_batch_streak": 0.12062499690800906,
"step": 390
},
{
"completion_length": 31.74375,
"epoch": 0.0031857025668798433,
"grad_norm": 0.959058403968811,
"kl": 0.33982390016317365,
"learning_rate": 4.93261217644956e-06,
"loss": 0.0136,
"reward": 0.0769291702657938,
"reward_std": 0.21129344925284385,
"rewards/custom_reward_logic_v4_batch_streak": 0.0769291702657938,
"step": 400
},
{
"completion_length": 18.8875,
"epoch": 0.0032653451310518393,
"grad_norm": 0.06978488713502884,
"kl": 0.3929797440767288,
"learning_rate": 4.9239697496952904e-06,
"loss": 0.0157,
"reward": 0.17000000439584256,
"reward_std": 0.222589847445488,
"rewards/custom_reward_logic_v4_batch_streak": 0.17000000439584256,
"step": 410
},
{
"completion_length": 20.36875,
"epoch": 0.0033449876952238353,
"grad_norm": 0.08526802808046341,
"kl": 0.3567995116114616,
"learning_rate": 4.914814565722671e-06,
"loss": 0.0143,
"reward": 0.28562499955296516,
"reward_std": 0.14162895157933236,
"rewards/custom_reward_logic_v4_batch_streak": 0.28562499955296516,
"step": 420
},
{
"completion_length": 19.39375,
"epoch": 0.0034246302593958313,
"grad_norm": 0.124000184237957,
"kl": 0.36893701553344727,
"learning_rate": 4.905148561184033e-06,
"loss": 0.0148,
"reward": 0.10037499703466893,
"reward_std": 0.09048115760087967,
"rewards/custom_reward_logic_v4_batch_streak": 0.10037499703466893,
"step": 430
},
{
"completion_length": 20.575,
"epoch": 0.0035042728235678278,
"grad_norm": 0.5444723963737488,
"kl": 0.34243927001953123,
"learning_rate": 4.894973780788722e-06,
"loss": 0.0137,
"reward": 0.17549999970942737,
"reward_std": 0.09591511413455009,
"rewards/custom_reward_logic_v4_batch_streak": 0.17549999970942737,
"step": 440
},
{
"completion_length": 30.30625,
"epoch": 0.0035839153877398238,
"grad_norm": 0.46829524636268616,
"kl": 0.31388519033789636,
"learning_rate": 4.884292376870567e-06,
"loss": 0.0126,
"reward": 0.10238021239638329,
"reward_std": 0.17024155631661414,
"rewards/custom_reward_logic_v4_batch_streak": 0.10238021239638329,
"step": 450
},
{
"completion_length": 39.55,
"epoch": 0.0036635579519118198,
"grad_norm": 0.2565569579601288,
"kl": 0.33122892007231713,
"learning_rate": 4.873106608932585e-06,
"loss": 0.0132,
"reward": 0.1605322863906622,
"reward_std": 0.3534850224852562,
"rewards/custom_reward_logic_v4_batch_streak": 0.1605322863906622,
"step": 460
},
{
"completion_length": 20.33125,
"epoch": 0.003743200516083816,
"grad_norm": 0.9290266633033752,
"kl": 0.3362825021147728,
"learning_rate": 4.861418843169012e-06,
"loss": 0.0135,
"reward": 0.06937500275671482,
"reward_std": 0.1000722162425518,
"rewards/custom_reward_logic_v4_batch_streak": 0.06937500275671482,
"step": 470
}
],
"logging_steps": 10,
"max_steps": 2400,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}