{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 1284, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09345794392523364, "grad_norm": 0.25848760281456373, "learning_rate": 9.743589743589744e-05, "loss": 1.0955, "step": 20 }, { "epoch": 0.18691588785046728, "grad_norm": 0.12152540967981099, "learning_rate": 0.0002, "loss": 0.8826, "step": 40 }, { "epoch": 0.2803738317757009, "grad_norm": 0.08766356744406019, "learning_rate": 0.00019987267934654538, "loss": 0.7805, "step": 60 }, { "epoch": 0.37383177570093457, "grad_norm": 0.07045964353767724, "learning_rate": 0.00019949104159715743, "loss": 0.7223, "step": 80 }, { "epoch": 0.4672897196261682, "grad_norm": 0.09463916110765375, "learning_rate": 0.00019885605855918885, "loss": 0.6933, "step": 100 }, { "epoch": 0.5607476635514018, "grad_norm": 0.09122862084573752, "learning_rate": 0.0001979693471617462, "loss": 0.6758, "step": 120 }, { "epoch": 0.6542056074766355, "grad_norm": 0.10577377685042465, "learning_rate": 0.00019683316533832042, "loss": 0.6605, "step": 140 }, { "epoch": 0.7476635514018691, "grad_norm": 0.11067163440878329, "learning_rate": 0.0001954504062771555, "loss": 0.6538, "step": 160 }, { "epoch": 0.8411214953271028, "grad_norm": 0.1121421761603273, "learning_rate": 0.00019382459105399632, "loss": 0.6534, "step": 180 }, { "epoch": 0.9345794392523364, "grad_norm": 0.10812490276750944, "learning_rate": 0.00019195985966597494, "loss": 0.6428, "step": 200 }, { "epoch": 1.02803738317757, "grad_norm": 0.11935465896028669, "learning_rate": 0.00018986096048946824, "loss": 0.6327, "step": 220 }, { "epoch": 1.1214953271028036, "grad_norm": 0.11781728745931254, "learning_rate": 0.0001875332381887699, "loss": 0.6322, "step": 240 }, { "epoch": 1.2149532710280373, "grad_norm": 0.1316640938577259, "learning_rate": 0.00018498262010636774, "loss": 0.6235, "step": 260 }, { "epoch": 1.308411214953271, "grad_norm": 0.13767423202221835, "learning_rate": 0.00018221560116948103, "loss": 0.6176, "step": 280 }, { "epoch": 1.4018691588785046, "grad_norm": 0.1255781456618538, "learning_rate": 0.00017923922735129302, "loss": 0.6194, "step": 300 }, { "epoch": 1.4953271028037383, "grad_norm": 0.12566754750336923, "learning_rate": 0.00017606107772899287, "loss": 0.6169, "step": 320 }, { "epoch": 1.588785046728972, "grad_norm": 0.1224054760200796, "learning_rate": 0.00017268924518431438, "loss": 0.6126, "step": 340 }, { "epoch": 1.6822429906542056, "grad_norm": 0.12754703201153322, "learning_rate": 0.00016913231579571608, "loss": 0.6127, "step": 360 }, { "epoch": 1.7757009345794392, "grad_norm": 0.13837660664241036, "learning_rate": 0.00016539934697467894, "loss": 0.6097, "step": 380 }, { "epoch": 1.8691588785046729, "grad_norm": 0.11566328949125707, "learning_rate": 0.00016149984440179537, "loss": 0.6039, "step": 400 }, { "epoch": 1.9626168224299065, "grad_norm": 0.13012304346488843, "learning_rate": 0.00015744373782137992, "loss": 0.6028, "step": 420 }, { "epoch": 2.05607476635514, "grad_norm": 0.13699329797526194, "learning_rate": 0.00015324135575623857, "loss": 0.5963, "step": 440 }, { "epoch": 2.149532710280374, "grad_norm": 0.12056530916496998, "learning_rate": 0.00014890339920698334, "loss": 0.5891, "step": 460 }, { "epoch": 2.2429906542056073, "grad_norm": 0.12376168201657378, "learning_rate": 0.0001444409144028644, "loss": 0.5913, "step": 480 }, { "epoch": 2.336448598130841, "grad_norm": 0.1285403052607913, "learning_rate": 0.0001398652646735076, "loss": 0.5893, "step": 500 }, { "epoch": 2.4299065420560746, "grad_norm": 0.12392132477639638, "learning_rate": 0.0001351881015131833, "loss": 0.5883, "step": 520 }, { "epoch": 2.5233644859813085, "grad_norm": 0.1338793765627719, "learning_rate": 0.00013042133491128935, "loss": 0.5859, "step": 540 }, { "epoch": 2.616822429906542, "grad_norm": 0.12415978424557397, "learning_rate": 0.00012557710302459803, "loss": 0.5856, "step": 560 }, { "epoch": 2.710280373831776, "grad_norm": 0.1313564813990853, "learning_rate": 0.00012066774126849529, "loss": 0.5893, "step": 580 }, { "epoch": 2.803738317757009, "grad_norm": 0.11960890307280825, "learning_rate": 0.00011570575090591791, "loss": 0.5849, "step": 600 }, { "epoch": 2.897196261682243, "grad_norm": 0.11986525148452289, "learning_rate": 0.00011070376721397373, "loss": 0.5861, "step": 620 }, { "epoch": 2.9906542056074765, "grad_norm": 0.12094208349063497, "learning_rate": 0.00010567452730930743, "loss": 0.5843, "step": 640 }, { "epoch": 3.0841121495327104, "grad_norm": 0.12592138466913735, "learning_rate": 0.00010063083771413975, "loss": 0.5776, "step": 660 }, { "epoch": 3.177570093457944, "grad_norm": 0.13879295451237433, "learning_rate": 9.55855417455723e-05, "loss": 0.5735, "step": 680 }, { "epoch": 3.2710280373831777, "grad_norm": 0.132827192019763, "learning_rate": 9.055148681119688e-05, "loss": 0.5762, "step": 700 }, { "epoch": 3.364485981308411, "grad_norm": 0.135202644806124, "learning_rate": 8.554149169428894e-05, "loss": 0.5738, "step": 720 }, { "epoch": 3.457943925233645, "grad_norm": 0.12535841207061021, "learning_rate": 8.056831391189023e-05, "loss": 0.5703, "step": 740 }, { "epoch": 3.5514018691588785, "grad_norm": 0.13117365043331122, "learning_rate": 7.564461722890081e-05, "loss": 0.5707, "step": 760 }, { "epoch": 3.6448598130841123, "grad_norm": 0.12563061031824993, "learning_rate": 7.078293941090249e-05, "loss": 0.5687, "step": 780 }, { "epoch": 3.7383177570093458, "grad_norm": 0.13424430991092495, "learning_rate": 6.599566029782863e-05, "loss": 0.5717, "step": 800 }, { "epoch": 3.831775700934579, "grad_norm": 0.12859665304631457, "learning_rate": 6.129497027977829e-05, "loss": 0.5694, "step": 820 }, { "epoch": 3.925233644859813, "grad_norm": 0.12403658321797129, "learning_rate": 5.669283925524715e-05, "loss": 0.5696, "step": 840 }, { "epoch": 4.018691588785047, "grad_norm": 0.1241236470560002, "learning_rate": 5.2200986150821696e-05, "loss": 0.5678, "step": 860 }, { "epoch": 4.11214953271028, "grad_norm": 0.13373450432597092, "learning_rate": 4.783084907995156e-05, "loss": 0.559, "step": 880 }, { "epoch": 4.205607476635514, "grad_norm": 0.12955521613965384, "learning_rate": 4.359355621678764e-05, "loss": 0.5626, "step": 900 }, { "epoch": 4.299065420560748, "grad_norm": 0.1201829111448084, "learning_rate": 3.9499897459254375e-05, "loss": 0.5655, "step": 920 }, { "epoch": 4.392523364485982, "grad_norm": 0.1293018490823381, "learning_rate": 3.5560296953512295e-05, "loss": 0.5638, "step": 940 }, { "epoch": 4.485981308411215, "grad_norm": 0.11920916186970969, "learning_rate": 3.178478654977624e-05, "loss": 0.5608, "step": 960 }, { "epoch": 4.579439252336448, "grad_norm": 0.11405851380231317, "learning_rate": 2.818298025708075e-05, "loss": 0.562, "step": 980 }, { "epoch": 4.672897196261682, "grad_norm": 0.12170514209424545, "learning_rate": 2.4764049762041874e-05, "loss": 0.5686, "step": 1000 }, { "epoch": 4.766355140186916, "grad_norm": 0.14556896397750538, "learning_rate": 2.1536701073954558e-05, "loss": 0.5635, "step": 1020 }, { "epoch": 4.859813084112149, "grad_norm": 0.12172194075595427, "learning_rate": 1.8509152355696623e-05, "loss": 0.559, "step": 1040 }, { "epoch": 4.953271028037383, "grad_norm": 0.1318350224140456, "learning_rate": 1.5689112996891576e-05, "loss": 0.5616, "step": 1060 }, { "epoch": 5.046728971962617, "grad_norm": 0.12691480311352626, "learning_rate": 1.3083763982618025e-05, "loss": 0.56, "step": 1080 }, { "epoch": 5.140186915887851, "grad_norm": 0.11296084200322255, "learning_rate": 1.0699739607655435e-05, "loss": 0.5558, "step": 1100 }, { "epoch": 5.233644859813084, "grad_norm": 0.1309044210983197, "learning_rate": 8.543110582829272e-06, "loss": 0.5588, "step": 1120 }, { "epoch": 5.327102803738318, "grad_norm": 0.13260357681059787, "learning_rate": 6.61936857647355e-06, "loss": 0.5605, "step": 1140 }, { "epoch": 5.420560747663552, "grad_norm": 0.12322372236537348, "learning_rate": 4.933412230374812e-06, "loss": 0.5554, "step": 1160 }, { "epoch": 5.5140186915887845, "grad_norm": 0.11617175237743727, "learning_rate": 3.4895346858066724e-06, "loss": 0.5605, "step": 1180 }, { "epoch": 5.607476635514018, "grad_norm": 0.12278895489019458, "learning_rate": 2.291412651418778e-06, "loss": 0.5642, "step": 1200 }, { "epoch": 5.700934579439252, "grad_norm": 0.12252134829919185, "learning_rate": 1.3420970408178913e-06, "loss": 0.5619, "step": 1220 }, { "epoch": 5.794392523364486, "grad_norm": 0.21509033654221743, "learning_rate": 6.440052036815081e-07, "loss": 0.5539, "step": 1240 }, { "epoch": 5.88785046728972, "grad_norm": 0.12357352260583994, "learning_rate": 1.989147701871641e-07, "loss": 0.5559, "step": 1260 }, { "epoch": 5.981308411214953, "grad_norm": 0.12149087465170151, "learning_rate": 7.959124431622389e-09, "loss": 0.5592, "step": 1280 }, { "epoch": 6.0, "step": 1284, "total_flos": 8949072262070272.0, "train_loss": 0.6056221575009118, "train_runtime": 16251.3363, "train_samples_per_second": 5.056, "train_steps_per_second": 0.079 } ], "logging_steps": 20, "max_steps": 1284, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8949072262070272.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }