{ "best_metric": 0.7214915752410889, "best_model_checkpoint": "./output/checkpoint-750", "epoch": 16.666666666666668, "eval_steps": 150, "global_step": 750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2222222222222222, "grad_norm": 1.9090766906738281, "learning_rate": 2.9999999999999984e-06, "loss": 0.662, "step": 10 }, { "epoch": 0.4444444444444444, "grad_norm": 1.888444185256958, "learning_rate": 5.999999999999997e-06, "loss": 0.7254, "step": 20 }, { "epoch": 0.6666666666666666, "grad_norm": 1.6710277795791626, "learning_rate": 8.999999999999993e-06, "loss": 0.7334, "step": 30 }, { "epoch": 0.8888888888888888, "grad_norm": 1.5766774415969849, "learning_rate": 1.1999999999999994e-05, "loss": 0.7369, "step": 40 }, { "epoch": 1.1111111111111112, "grad_norm": 1.2492249011993408, "learning_rate": 1.499999999999999e-05, "loss": 0.8941, "step": 50 }, { "epoch": 1.3333333333333333, "grad_norm": 1.1361218690872192, "learning_rate": 1.7999999999999987e-05, "loss": 0.7144, "step": 60 }, { "epoch": 1.5555555555555556, "grad_norm": 1.5108751058578491, "learning_rate": 2.0999999999999985e-05, "loss": 0.7581, "step": 70 }, { "epoch": 1.7777777777777777, "grad_norm": 1.873094081878662, "learning_rate": 2.3999999999999987e-05, "loss": 0.7072, "step": 80 }, { "epoch": 2.0, "grad_norm": 2.0551469326019287, "learning_rate": 2.6999999999999982e-05, "loss": 0.7699, "step": 90 }, { "epoch": 2.2222222222222223, "grad_norm": 1.5656124353408813, "learning_rate": 2.999999999999998e-05, "loss": 0.7272, "step": 100 }, { "epoch": 2.4444444444444446, "grad_norm": 1.7015224695205688, "learning_rate": 2.999999702723961e-05, "loss": 0.7469, "step": 110 }, { "epoch": 2.6666666666666665, "grad_norm": 2.7948575019836426, "learning_rate": 2.9999988108959667e-05, "loss": 0.57, "step": 120 }, { "epoch": 2.888888888888889, "grad_norm": 2.1129257678985596, "learning_rate": 2.9999973245163695e-05, "loss": 0.7045, "step": 130 }, { "epoch": 3.111111111111111, "grad_norm": 2.2446517944335938, "learning_rate": 2.999995243585758e-05, "loss": 0.7167, "step": 140 }, { "epoch": 3.3333333333333335, "grad_norm": 1.9295082092285156, "learning_rate": 2.9999925681049573e-05, "loss": 0.6639, "step": 150 }, { "epoch": 3.3333333333333335, "eval_loss": 0.7685600519180298, "eval_runtime": 0.4511, "eval_samples_per_second": 22.169, "eval_steps_per_second": 22.169, "step": 150 }, { "epoch": 3.5555555555555554, "grad_norm": 2.560298204421997, "learning_rate": 2.9999892980750276e-05, "loss": 0.6943, "step": 160 }, { "epoch": 3.7777777777777777, "grad_norm": 2.3921022415161133, "learning_rate": 2.9999854334972655e-05, "loss": 0.6927, "step": 170 }, { "epoch": 4.0, "grad_norm": 2.541400909423828, "learning_rate": 2.999980974373202e-05, "loss": 0.6681, "step": 180 }, { "epoch": 4.222222222222222, "grad_norm": 1.6048011779785156, "learning_rate": 2.9999759207046055e-05, "loss": 0.59, "step": 190 }, { "epoch": 4.444444444444445, "grad_norm": 1.6808319091796875, "learning_rate": 2.9999702724934783e-05, "loss": 0.7109, "step": 200 }, { "epoch": 4.666666666666667, "grad_norm": 3.4204533100128174, "learning_rate": 2.99996402974206e-05, "loss": 0.6277, "step": 210 }, { "epoch": 4.888888888888889, "grad_norm": 2.078854560852051, "learning_rate": 2.9999571924528243e-05, "loss": 0.6732, "step": 220 }, { "epoch": 5.111111111111111, "grad_norm": 2.6296238899230957, "learning_rate": 2.9999497606284816e-05, "loss": 0.6029, "step": 230 }, { "epoch": 5.333333333333333, "grad_norm": 1.4844911098480225, "learning_rate": 2.9999417342719775e-05, "loss": 0.6941, "step": 240 }, { "epoch": 5.555555555555555, "grad_norm": 2.2181289196014404, "learning_rate": 2.9999331133864935e-05, "loss": 0.6476, "step": 250 }, { "epoch": 5.777777777777778, "grad_norm": 1.705496907234192, "learning_rate": 2.9999238979754465e-05, "loss": 0.6095, "step": 260 }, { "epoch": 6.0, "grad_norm": 1.9006123542785645, "learning_rate": 2.99991408804249e-05, "loss": 0.5755, "step": 270 }, { "epoch": 6.222222222222222, "grad_norm": 2.1468276977539062, "learning_rate": 2.999903683591511e-05, "loss": 0.5742, "step": 280 }, { "epoch": 6.444444444444445, "grad_norm": 1.3946986198425293, "learning_rate": 2.9998926846266345e-05, "loss": 0.6137, "step": 290 }, { "epoch": 6.666666666666667, "grad_norm": 2.292116641998291, "learning_rate": 2.9998810911522193e-05, "loss": 0.6226, "step": 300 }, { "epoch": 6.666666666666667, "eval_loss": 0.7391407489776611, "eval_runtime": 0.5623, "eval_samples_per_second": 17.783, "eval_steps_per_second": 17.783, "step": 300 }, { "Start_State_loss": 0.7391407489776611, "Start_State_runtime": 0.5004, "Start_State_samples_per_second": 19.984, "Start_State_steps_per_second": 19.984, "epoch": 6.666666666666667, "step": 300 }, { "SWA_loss": 0.7391407489776611, "SWA_runtime": 0.5139, "SWA_samples_per_second": 19.46, "SWA_steps_per_second": 19.46, "epoch": 6.666666666666667, "step": 300 }, { "EMA_loss": 0.7391407489776611, "EMA_runtime": 0.515, "EMA_samples_per_second": 19.417, "EMA_steps_per_second": 19.417, "epoch": 6.666666666666667, "step": 300 }, { "epoch": 6.888888888888889, "grad_norm": 2.7107701301574707, "learning_rate": 2.9998689031728615e-05, "loss": 0.6481, "step": 310 }, { "epoch": 7.111111111111111, "grad_norm": 1.764064908027649, "learning_rate": 2.9998561206933918e-05, "loss": 0.5866, "step": 320 }, { "epoch": 7.333333333333333, "grad_norm": 1.7632637023925781, "learning_rate": 2.9998427437188766e-05, "loss": 0.5798, "step": 330 }, { "epoch": 7.555555555555555, "grad_norm": 2.3483335971832275, "learning_rate": 2.999828772254618e-05, "loss": 0.6034, "step": 340 }, { "epoch": 7.777777777777778, "grad_norm": 2.47190260887146, "learning_rate": 2.9998142063061544e-05, "loss": 0.662, "step": 350 }, { "epoch": 8.0, "grad_norm": 1.323142409324646, "learning_rate": 2.9997990458792583e-05, "loss": 0.6039, "step": 360 }, { "epoch": 8.222222222222221, "grad_norm": 1.9020463228225708, "learning_rate": 2.9997832909799397e-05, "loss": 0.5489, "step": 370 }, { "epoch": 8.444444444444445, "grad_norm": 1.9343500137329102, "learning_rate": 2.9997669416144432e-05, "loss": 0.641, "step": 380 }, { "epoch": 8.666666666666666, "grad_norm": 1.0505070686340332, "learning_rate": 2.999749997789249e-05, "loss": 0.5396, "step": 390 }, { "epoch": 8.88888888888889, "grad_norm": 1.5202258825302124, "learning_rate": 2.9997324595110723e-05, "loss": 0.6543, "step": 400 }, { "epoch": 9.11111111111111, "grad_norm": 1.384507656097412, "learning_rate": 2.9997143267868663e-05, "loss": 0.5946, "step": 410 }, { "epoch": 9.333333333333334, "grad_norm": 2.468230962753296, "learning_rate": 2.999695599623817e-05, "loss": 0.6231, "step": 420 }, { "epoch": 9.555555555555555, "grad_norm": 2.60021710395813, "learning_rate": 2.9996762780293483e-05, "loss": 0.575, "step": 430 }, { "epoch": 9.777777777777779, "grad_norm": 1.5323718786239624, "learning_rate": 2.9996563620111176e-05, "loss": 0.529, "step": 440 }, { "epoch": 10.0, "grad_norm": 1.3856033086776733, "learning_rate": 2.9996358515770198e-05, "loss": 0.5417, "step": 450 }, { "epoch": 10.0, "eval_loss": 0.7259252071380615, "eval_runtime": 0.4435, "eval_samples_per_second": 22.546, "eval_steps_per_second": 22.546, "step": 450 }, { "Start_State_loss": 0.7391407489776611, "Start_State_runtime": 0.4445, "Start_State_samples_per_second": 22.495, "Start_State_steps_per_second": 22.495, "epoch": 10.0, "step": 450 }, { "Raw_Model_loss": 0.7259252071380615, "Raw_Model_runtime": 0.4437, "Raw_Model_samples_per_second": 22.539, "Raw_Model_steps_per_second": 22.539, "epoch": 10.0, "step": 450 }, { "SWA_loss": 0.733832836151123, "SWA_runtime": 0.4445, "SWA_samples_per_second": 22.496, "SWA_steps_per_second": 22.496, "epoch": 10.0, "step": 450 }, { "EMA_loss": 0.738980233669281, "EMA_runtime": 0.4444, "EMA_samples_per_second": 22.503, "EMA_steps_per_second": 22.503, "epoch": 10.0, "step": 450 }, { "epoch": 10.222222222222221, "grad_norm": 2.220747470855713, "learning_rate": 2.9996147467351836e-05, "loss": 0.5056, "step": 460 }, { "epoch": 10.444444444444445, "grad_norm": 1.4205608367919922, "learning_rate": 2.9995930474939753e-05, "loss": 0.4901, "step": 470 }, { "epoch": 10.666666666666666, "grad_norm": 1.9306081533432007, "learning_rate": 2.9995707538619954e-05, "loss": 0.6361, "step": 480 }, { "epoch": 10.88888888888889, "grad_norm": 2.1457133293151855, "learning_rate": 2.9995478658480802e-05, "loss": 0.5528, "step": 490 }, { "epoch": 11.11111111111111, "grad_norm": 1.8677959442138672, "learning_rate": 2.9995243834613023e-05, "loss": 0.5233, "step": 500 }, { "epoch": 11.333333333333334, "grad_norm": 1.6708972454071045, "learning_rate": 2.9995003067109687e-05, "loss": 0.5387, "step": 510 }, { "epoch": 11.555555555555555, "grad_norm": 2.6434991359710693, "learning_rate": 2.9994756356066226e-05, "loss": 0.5847, "step": 520 }, { "epoch": 11.777777777777779, "grad_norm": 2.2601070404052734, "learning_rate": 2.999450370158044e-05, "loss": 0.5341, "step": 530 }, { "epoch": 12.0, "grad_norm": 1.5335863828659058, "learning_rate": 2.9994245103752457e-05, "loss": 0.5242, "step": 540 }, { "epoch": 12.222222222222221, "grad_norm": 1.2394074201583862, "learning_rate": 2.999398056268479e-05, "loss": 0.5356, "step": 550 }, { "epoch": 12.444444444444445, "grad_norm": 1.472650170326233, "learning_rate": 2.9993710078482286e-05, "loss": 0.415, "step": 560 }, { "epoch": 12.666666666666666, "grad_norm": 3.3844995498657227, "learning_rate": 2.9993433651252164e-05, "loss": 0.6192, "step": 570 }, { "epoch": 12.88888888888889, "grad_norm": 1.4811444282531738, "learning_rate": 2.9993151281103986e-05, "loss": 0.5351, "step": 580 }, { "epoch": 13.11111111111111, "grad_norm": 2.4430384635925293, "learning_rate": 2.9992862968149675e-05, "loss": 0.4177, "step": 590 }, { "epoch": 13.333333333333334, "grad_norm": 2.456298351287842, "learning_rate": 2.9992568712503513e-05, "loss": 0.5317, "step": 600 }, { "epoch": 13.333333333333334, "eval_loss": 0.7220126986503601, "eval_runtime": 0.5023, "eval_samples_per_second": 19.91, "eval_steps_per_second": 19.91, "step": 600 }, { "Start_State_loss": 0.7391407489776611, "Start_State_runtime": 0.4727, "Start_State_samples_per_second": 21.154, "Start_State_steps_per_second": 21.154, "epoch": 13.333333333333334, "step": 600 }, { "Raw_Model_loss": 0.7220126986503601, "Raw_Model_runtime": 0.5347, "Raw_Model_samples_per_second": 18.703, "Raw_Model_steps_per_second": 18.703, "epoch": 13.333333333333334, "step": 600 }, { "SWA_loss": 0.7282296419143677, "SWA_runtime": 0.5752, "SWA_samples_per_second": 17.384, "SWA_steps_per_second": 17.384, "epoch": 13.333333333333334, "step": 600 }, { "EMA_loss": 0.7385488748550415, "EMA_runtime": 0.5662, "EMA_samples_per_second": 17.661, "EMA_steps_per_second": 17.661, "epoch": 13.333333333333334, "step": 600 }, { "epoch": 13.555555555555555, "grad_norm": 2.3377010822296143, "learning_rate": 2.9992268514282122e-05, "loss": 0.565, "step": 610 }, { "epoch": 13.777777777777779, "grad_norm": 2.2196319103240967, "learning_rate": 2.99919623736045e-05, "loss": 0.441, "step": 620 }, { "epoch": 14.0, "grad_norm": 2.2767350673675537, "learning_rate": 2.9991650290591996e-05, "loss": 0.6033, "step": 630 }, { "epoch": 14.222222222222221, "grad_norm": 2.253643035888672, "learning_rate": 2.99913322653683e-05, "loss": 0.4925, "step": 640 }, { "epoch": 14.444444444444445, "grad_norm": 1.8424692153930664, "learning_rate": 2.9991008298059473e-05, "loss": 0.5007, "step": 650 }, { "epoch": 14.666666666666666, "grad_norm": 1.5401960611343384, "learning_rate": 2.9990678388793924e-05, "loss": 0.5318, "step": 660 }, { "epoch": 14.88888888888889, "grad_norm": 1.2824598550796509, "learning_rate": 2.999034253770242e-05, "loss": 0.4575, "step": 670 }, { "epoch": 15.11111111111111, "grad_norm": 2.5211098194122314, "learning_rate": 2.9990000744918076e-05, "loss": 0.449, "step": 680 }, { "epoch": 15.333333333333334, "grad_norm": 1.6035919189453125, "learning_rate": 2.9989653010576372e-05, "loss": 0.4529, "step": 690 }, { "epoch": 15.555555555555555, "grad_norm": 1.4720438718795776, "learning_rate": 2.9989299334815138e-05, "loss": 0.4804, "step": 700 }, { "epoch": 15.777777777777779, "grad_norm": 2.241570472717285, "learning_rate": 2.9988939717774558e-05, "loss": 0.524, "step": 710 }, { "epoch": 16.0, "grad_norm": 1.3463960886001587, "learning_rate": 2.9988574159597174e-05, "loss": 0.5105, "step": 720 }, { "epoch": 16.22222222222222, "grad_norm": 2.1436588764190674, "learning_rate": 2.9988202660427887e-05, "loss": 0.4644, "step": 730 }, { "epoch": 16.444444444444443, "grad_norm": 3.0679702758789062, "learning_rate": 2.9987825220413937e-05, "loss": 0.5349, "step": 740 }, { "epoch": 16.666666666666668, "grad_norm": 1.908082127571106, "learning_rate": 2.998744183970494e-05, "loss": 0.4263, "step": 750 }, { "epoch": 16.666666666666668, "eval_loss": 0.7214915752410889, "eval_runtime": 0.4155, "eval_samples_per_second": 24.069, "eval_steps_per_second": 24.069, "step": 750 }, { "Start_State_loss": 0.7391407489776611, "Start_State_runtime": 0.4325, "Start_State_samples_per_second": 23.123, "Start_State_steps_per_second": 23.123, "epoch": 16.666666666666668, "step": 750 }, { "Raw_Model_loss": 0.7214915752410889, "Raw_Model_runtime": 0.4148, "Raw_Model_samples_per_second": 24.108, "Raw_Model_steps_per_second": 24.108, "epoch": 16.666666666666668, "step": 750 }, { "SWA_loss": 0.7257974147796631, "SWA_runtime": 0.4246, "SWA_samples_per_second": 23.553, "SWA_steps_per_second": 23.553, "epoch": 16.666666666666668, "step": 750 }, { "EMA_loss": 0.7391572594642639, "EMA_runtime": 0.4233, "EMA_samples_per_second": 23.622, "EMA_steps_per_second": 23.622, "epoch": 16.666666666666668, "step": 750 } ], "logging_steps": 10, "max_steps": 50000, "num_input_tokens_seen": 0, "num_train_epochs": 1112, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7981049240027136.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }