{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 1182, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.050761421319796954, "grad_norm": 1.9596132040023804, "learning_rate": 4.961928934010153e-05, "loss": 3.0434, "step": 10 }, { "epoch": 0.10152284263959391, "grad_norm": 1.4552912712097168, "learning_rate": 4.919627749576988e-05, "loss": 1.6984, "step": 20 }, { "epoch": 0.15228426395939088, "grad_norm": 1.4020640850067139, "learning_rate": 4.877326565143824e-05, "loss": 1.0141, "step": 30 }, { "epoch": 0.20304568527918782, "grad_norm": 1.3040558099746704, "learning_rate": 4.83502538071066e-05, "loss": 0.6325, "step": 40 }, { "epoch": 0.25380710659898476, "grad_norm": 0.9710696935653687, "learning_rate": 4.792724196277496e-05, "loss": 0.3562, "step": 50 }, { "epoch": 0.30456852791878175, "grad_norm": 0.9529483914375305, "learning_rate": 4.750423011844332e-05, "loss": 0.2681, "step": 60 }, { "epoch": 0.3553299492385787, "grad_norm": 0.9297605156898499, "learning_rate": 4.7081218274111674e-05, "loss": 0.1867, "step": 70 }, { "epoch": 0.40609137055837563, "grad_norm": 0.6723515391349792, "learning_rate": 4.665820642978004e-05, "loss": 0.1557, "step": 80 }, { "epoch": 0.45685279187817257, "grad_norm": 0.5906422734260559, "learning_rate": 4.6235194585448395e-05, "loss": 0.1332, "step": 90 }, { "epoch": 0.5076142131979695, "grad_norm": 0.562096357345581, "learning_rate": 4.5812182741116755e-05, "loss": 0.1113, "step": 100 }, { "epoch": 0.5583756345177665, "grad_norm": 0.6856290102005005, "learning_rate": 4.538917089678511e-05, "loss": 0.0982, "step": 110 }, { "epoch": 0.6091370558375635, "grad_norm": 0.3303697407245636, "learning_rate": 4.496615905245347e-05, "loss": 0.0794, "step": 120 }, { "epoch": 0.6598984771573604, "grad_norm": 0.5941248536109924, "learning_rate": 4.454314720812183e-05, "loss": 0.0799, "step": 130 }, { "epoch": 0.7106598984771574, "grad_norm": 0.46145302057266235, "learning_rate": 4.412013536379019e-05, "loss": 0.0728, "step": 140 }, { "epoch": 0.7614213197969543, "grad_norm": 0.5075628161430359, "learning_rate": 4.369712351945855e-05, "loss": 0.0705, "step": 150 }, { "epoch": 0.8121827411167513, "grad_norm": 0.2965494394302368, "learning_rate": 4.32741116751269e-05, "loss": 0.0634, "step": 160 }, { "epoch": 0.8629441624365483, "grad_norm": 0.3922906219959259, "learning_rate": 4.285109983079527e-05, "loss": 0.0599, "step": 170 }, { "epoch": 0.9137055837563451, "grad_norm": 0.3413899540901184, "learning_rate": 4.242808798646362e-05, "loss": 0.0529, "step": 180 }, { "epoch": 0.9644670050761421, "grad_norm": 0.37600159645080566, "learning_rate": 4.200507614213198e-05, "loss": 0.0548, "step": 190 }, { "epoch": 1.0, "eval_loss": 0.03613473102450371, "eval_runtime": 6.8779, "eval_samples_per_second": 50.887, "eval_steps_per_second": 3.199, "step": 197 }, { "epoch": 1.015228426395939, "grad_norm": 0.4854377508163452, "learning_rate": 4.1582064297800336e-05, "loss": 0.058, "step": 200 }, { "epoch": 1.0659898477157361, "grad_norm": 0.39907264709472656, "learning_rate": 4.1159052453468696e-05, "loss": 0.0609, "step": 210 }, { "epoch": 1.116751269035533, "grad_norm": 0.24890871345996857, "learning_rate": 4.073604060913706e-05, "loss": 0.0473, "step": 220 }, { "epoch": 1.16751269035533, "grad_norm": 0.4353676736354828, "learning_rate": 4.0313028764805416e-05, "loss": 0.0513, "step": 230 }, { "epoch": 1.218274111675127, "grad_norm": 0.38258448243141174, "learning_rate": 3.9890016920473777e-05, "loss": 0.0503, "step": 240 }, { "epoch": 1.2690355329949239, "grad_norm": 0.3302125334739685, "learning_rate": 3.946700507614213e-05, "loss": 0.0478, "step": 250 }, { "epoch": 1.3197969543147208, "grad_norm": 0.401644229888916, "learning_rate": 3.90439932318105e-05, "loss": 0.0457, "step": 260 }, { "epoch": 1.3705583756345177, "grad_norm": 0.31225109100341797, "learning_rate": 3.862098138747885e-05, "loss": 0.0452, "step": 270 }, { "epoch": 1.4213197969543148, "grad_norm": 0.30924656987190247, "learning_rate": 3.819796954314721e-05, "loss": 0.0428, "step": 280 }, { "epoch": 1.4720812182741116, "grad_norm": 0.543154239654541, "learning_rate": 3.7774957698815564e-05, "loss": 0.048, "step": 290 }, { "epoch": 1.5228426395939088, "grad_norm": 0.2982091009616852, "learning_rate": 3.735194585448393e-05, "loss": 0.0427, "step": 300 }, { "epoch": 1.5736040609137056, "grad_norm": 0.3622360825538635, "learning_rate": 3.692893401015229e-05, "loss": 0.0431, "step": 310 }, { "epoch": 1.6243654822335025, "grad_norm": 0.2379499226808548, "learning_rate": 3.6505922165820644e-05, "loss": 0.0408, "step": 320 }, { "epoch": 1.6751269035532994, "grad_norm": 0.2724953889846802, "learning_rate": 3.6082910321489004e-05, "loss": 0.0419, "step": 330 }, { "epoch": 1.7258883248730963, "grad_norm": 0.21542227268218994, "learning_rate": 3.565989847715736e-05, "loss": 0.0439, "step": 340 }, { "epoch": 1.7766497461928934, "grad_norm": 0.24891333281993866, "learning_rate": 3.5236886632825724e-05, "loss": 0.0393, "step": 350 }, { "epoch": 1.8274111675126905, "grad_norm": 0.18472662568092346, "learning_rate": 3.481387478849408e-05, "loss": 0.0372, "step": 360 }, { "epoch": 1.8781725888324874, "grad_norm": 0.1834375113248825, "learning_rate": 3.439086294416244e-05, "loss": 0.0383, "step": 370 }, { "epoch": 1.9289340101522843, "grad_norm": 0.26916465163230896, "learning_rate": 3.396785109983079e-05, "loss": 0.0419, "step": 380 }, { "epoch": 1.9796954314720812, "grad_norm": 0.2296602427959442, "learning_rate": 3.354483925549916e-05, "loss": 0.0391, "step": 390 }, { "epoch": 2.0, "eval_loss": 0.029596656560897827, "eval_runtime": 6.9466, "eval_samples_per_second": 50.384, "eval_steps_per_second": 3.167, "step": 394 }, { "epoch": 2.030456852791878, "grad_norm": 0.22394953668117523, "learning_rate": 3.312182741116752e-05, "loss": 0.0368, "step": 400 }, { "epoch": 2.081218274111675, "grad_norm": 0.24742868542671204, "learning_rate": 3.269881556683587e-05, "loss": 0.0417, "step": 410 }, { "epoch": 2.1319796954314723, "grad_norm": 0.17821934819221497, "learning_rate": 3.227580372250423e-05, "loss": 0.039, "step": 420 }, { "epoch": 2.182741116751269, "grad_norm": 0.17562909424304962, "learning_rate": 3.185279187817259e-05, "loss": 0.036, "step": 430 }, { "epoch": 2.233502538071066, "grad_norm": 0.24495473504066467, "learning_rate": 3.142978003384095e-05, "loss": 0.038, "step": 440 }, { "epoch": 2.284263959390863, "grad_norm": 0.21984700858592987, "learning_rate": 3.1006768189509306e-05, "loss": 0.0364, "step": 450 }, { "epoch": 2.33502538071066, "grad_norm": 0.263046532869339, "learning_rate": 3.0583756345177666e-05, "loss": 0.0393, "step": 460 }, { "epoch": 2.3857868020304567, "grad_norm": 0.494204044342041, "learning_rate": 3.016074450084603e-05, "loss": 0.0342, "step": 470 }, { "epoch": 2.436548223350254, "grad_norm": 0.24457719922065735, "learning_rate": 2.9737732656514383e-05, "loss": 0.0371, "step": 480 }, { "epoch": 2.487309644670051, "grad_norm": 0.2866905629634857, "learning_rate": 2.9314720812182743e-05, "loss": 0.0375, "step": 490 }, { "epoch": 2.5380710659898478, "grad_norm": 0.1922035664319992, "learning_rate": 2.88917089678511e-05, "loss": 0.0339, "step": 500 }, { "epoch": 2.5888324873096447, "grad_norm": 0.2251596301794052, "learning_rate": 2.846869712351946e-05, "loss": 0.0316, "step": 510 }, { "epoch": 2.6395939086294415, "grad_norm": 0.19956769049167633, "learning_rate": 2.8045685279187816e-05, "loss": 0.0367, "step": 520 }, { "epoch": 2.6903553299492384, "grad_norm": 0.23161649703979492, "learning_rate": 2.7622673434856176e-05, "loss": 0.0335, "step": 530 }, { "epoch": 2.7411167512690353, "grad_norm": 0.2735691964626312, "learning_rate": 2.7199661590524533e-05, "loss": 0.0367, "step": 540 }, { "epoch": 2.7918781725888326, "grad_norm": 0.3856474757194519, "learning_rate": 2.6776649746192893e-05, "loss": 0.0362, "step": 550 }, { "epoch": 2.8426395939086295, "grad_norm": 0.24519683420658112, "learning_rate": 2.6353637901861257e-05, "loss": 0.031, "step": 560 }, { "epoch": 2.8934010152284264, "grad_norm": 0.12949654459953308, "learning_rate": 2.593062605752961e-05, "loss": 0.032, "step": 570 }, { "epoch": 2.9441624365482233, "grad_norm": 0.1476690173149109, "learning_rate": 2.5507614213197974e-05, "loss": 0.0351, "step": 580 }, { "epoch": 2.99492385786802, "grad_norm": 0.24033169448375702, "learning_rate": 2.5084602368866327e-05, "loss": 0.0337, "step": 590 }, { "epoch": 3.0, "eval_loss": 0.02790662832558155, "eval_runtime": 6.9258, "eval_samples_per_second": 50.536, "eval_steps_per_second": 3.177, "step": 591 }, { "epoch": 3.045685279187817, "grad_norm": 0.25604188442230225, "learning_rate": 2.466159052453469e-05, "loss": 0.0339, "step": 600 }, { "epoch": 3.0964467005076144, "grad_norm": 0.15198302268981934, "learning_rate": 2.4238578680203047e-05, "loss": 0.0316, "step": 610 }, { "epoch": 3.1472081218274113, "grad_norm": 0.18943068385124207, "learning_rate": 2.3815566835871404e-05, "loss": 0.0302, "step": 620 }, { "epoch": 3.197969543147208, "grad_norm": 0.23807291686534882, "learning_rate": 2.3392554991539764e-05, "loss": 0.0338, "step": 630 }, { "epoch": 3.248730964467005, "grad_norm": 0.2615777552127838, "learning_rate": 2.296954314720812e-05, "loss": 0.0291, "step": 640 }, { "epoch": 3.299492385786802, "grad_norm": 0.20456817746162415, "learning_rate": 2.254653130287648e-05, "loss": 0.0331, "step": 650 }, { "epoch": 3.350253807106599, "grad_norm": 0.29629555344581604, "learning_rate": 2.2123519458544838e-05, "loss": 0.0324, "step": 660 }, { "epoch": 3.401015228426396, "grad_norm": 0.19070571660995483, "learning_rate": 2.17005076142132e-05, "loss": 0.0312, "step": 670 }, { "epoch": 3.451776649746193, "grad_norm": 0.17927491664886475, "learning_rate": 2.1277495769881558e-05, "loss": 0.0331, "step": 680 }, { "epoch": 3.50253807106599, "grad_norm": 0.16211186349391937, "learning_rate": 2.085448392554992e-05, "loss": 0.0324, "step": 690 }, { "epoch": 3.553299492385787, "grad_norm": 0.13928809762001038, "learning_rate": 2.0431472081218275e-05, "loss": 0.0315, "step": 700 }, { "epoch": 3.6040609137055837, "grad_norm": 0.2813867926597595, "learning_rate": 2.0008460236886635e-05, "loss": 0.03, "step": 710 }, { "epoch": 3.6548223350253806, "grad_norm": 0.2689349353313446, "learning_rate": 1.9585448392554992e-05, "loss": 0.0333, "step": 720 }, { "epoch": 3.7055837563451774, "grad_norm": 0.2879869043827057, "learning_rate": 1.916243654822335e-05, "loss": 0.035, "step": 730 }, { "epoch": 3.7563451776649748, "grad_norm": 0.1638893336057663, "learning_rate": 1.873942470389171e-05, "loss": 0.0331, "step": 740 }, { "epoch": 3.8071065989847717, "grad_norm": 0.08905433863401413, "learning_rate": 1.831641285956007e-05, "loss": 0.0291, "step": 750 }, { "epoch": 3.8578680203045685, "grad_norm": 0.2221483290195465, "learning_rate": 1.789340101522843e-05, "loss": 0.0334, "step": 760 }, { "epoch": 3.9086294416243654, "grad_norm": 0.16910187900066376, "learning_rate": 1.7470389170896786e-05, "loss": 0.0331, "step": 770 }, { "epoch": 3.9593908629441623, "grad_norm": 0.20653125643730164, "learning_rate": 1.7047377326565146e-05, "loss": 0.0328, "step": 780 }, { "epoch": 4.0, "eval_loss": 0.02677118219435215, "eval_runtime": 6.9163, "eval_samples_per_second": 50.605, "eval_steps_per_second": 3.181, "step": 788 }, { "epoch": 4.01015228426396, "grad_norm": 0.5460578203201294, "learning_rate": 1.6624365482233503e-05, "loss": 0.0317, "step": 790 }, { "epoch": 4.060913705583756, "grad_norm": 0.1273794323205948, "learning_rate": 1.6201353637901863e-05, "loss": 0.0324, "step": 800 }, { "epoch": 4.111675126903553, "grad_norm": 0.2069994956254959, "learning_rate": 1.577834179357022e-05, "loss": 0.0328, "step": 810 }, { "epoch": 4.16243654822335, "grad_norm": 0.13560791313648224, "learning_rate": 1.535532994923858e-05, "loss": 0.0289, "step": 820 }, { "epoch": 4.213197969543147, "grad_norm": 0.13835355639457703, "learning_rate": 1.493231810490694e-05, "loss": 0.0285, "step": 830 }, { "epoch": 4.2639593908629445, "grad_norm": 0.17146103084087372, "learning_rate": 1.4509306260575298e-05, "loss": 0.0328, "step": 840 }, { "epoch": 4.314720812182741, "grad_norm": 0.25955504179000854, "learning_rate": 1.4086294416243657e-05, "loss": 0.0295, "step": 850 }, { "epoch": 4.365482233502538, "grad_norm": 0.24718697369098663, "learning_rate": 1.3663282571912014e-05, "loss": 0.0307, "step": 860 }, { "epoch": 4.416243654822335, "grad_norm": 0.12164635211229324, "learning_rate": 1.3240270727580372e-05, "loss": 0.0287, "step": 870 }, { "epoch": 4.467005076142132, "grad_norm": 0.17382808029651642, "learning_rate": 1.281725888324873e-05, "loss": 0.0472, "step": 880 }, { "epoch": 4.517766497461929, "grad_norm": 0.17402203381061554, "learning_rate": 1.239424703891709e-05, "loss": 0.0343, "step": 890 }, { "epoch": 4.568527918781726, "grad_norm": 0.17245104908943176, "learning_rate": 1.1971235194585449e-05, "loss": 0.0318, "step": 900 }, { "epoch": 4.619289340101523, "grad_norm": 0.1376132220029831, "learning_rate": 1.1548223350253808e-05, "loss": 0.0319, "step": 910 }, { "epoch": 4.67005076142132, "grad_norm": 0.17528069019317627, "learning_rate": 1.1125211505922166e-05, "loss": 0.0302, "step": 920 }, { "epoch": 4.720812182741117, "grad_norm": 0.2443544864654541, "learning_rate": 1.0702199661590526e-05, "loss": 0.0295, "step": 930 }, { "epoch": 4.771573604060913, "grad_norm": 0.21152476966381073, "learning_rate": 1.0279187817258885e-05, "loss": 0.0331, "step": 940 }, { "epoch": 4.822335025380711, "grad_norm": 0.13216163218021393, "learning_rate": 9.856175972927243e-06, "loss": 0.0283, "step": 950 }, { "epoch": 4.873096446700508, "grad_norm": 0.1937057226896286, "learning_rate": 9.433164128595601e-06, "loss": 0.0285, "step": 960 }, { "epoch": 4.9238578680203045, "grad_norm": 0.1196654811501503, "learning_rate": 9.01015228426396e-06, "loss": 0.0299, "step": 970 }, { "epoch": 4.974619289340102, "grad_norm": 0.14108304679393768, "learning_rate": 8.587140439932318e-06, "loss": 0.0326, "step": 980 }, { "epoch": 5.0, "eval_loss": 0.026293369010090828, "eval_runtime": 7.2804, "eval_samples_per_second": 48.074, "eval_steps_per_second": 3.022, "step": 985 }, { "epoch": 5.025380710659898, "grad_norm": 0.11325781047344208, "learning_rate": 8.164128595600677e-06, "loss": 0.0303, "step": 990 }, { "epoch": 5.0761421319796955, "grad_norm": 0.1742030531167984, "learning_rate": 7.741116751269035e-06, "loss": 0.029, "step": 1000 }, { "epoch": 5.126903553299492, "grad_norm": 0.19924026727676392, "learning_rate": 7.318104906937395e-06, "loss": 0.0271, "step": 1010 }, { "epoch": 5.177664974619289, "grad_norm": 0.23700544238090515, "learning_rate": 6.895093062605754e-06, "loss": 0.0306, "step": 1020 }, { "epoch": 5.228426395939087, "grad_norm": 0.12165335565805435, "learning_rate": 6.472081218274112e-06, "loss": 0.0318, "step": 1030 }, { "epoch": 5.279187817258883, "grad_norm": 0.21364423632621765, "learning_rate": 6.049069373942471e-06, "loss": 0.03, "step": 1040 }, { "epoch": 5.32994923857868, "grad_norm": 0.19045327603816986, "learning_rate": 5.626057529610829e-06, "loss": 0.0325, "step": 1050 }, { "epoch": 5.380710659898477, "grad_norm": 0.10052906721830368, "learning_rate": 5.203045685279188e-06, "loss": 0.0278, "step": 1060 }, { "epoch": 5.431472081218274, "grad_norm": 0.2044578194618225, "learning_rate": 4.780033840947547e-06, "loss": 0.0309, "step": 1070 }, { "epoch": 5.482233502538071, "grad_norm": 0.19502834975719452, "learning_rate": 4.357021996615906e-06, "loss": 0.0318, "step": 1080 }, { "epoch": 5.532994923857868, "grad_norm": 0.13834765553474426, "learning_rate": 3.934010152284264e-06, "loss": 0.0305, "step": 1090 }, { "epoch": 5.583756345177665, "grad_norm": 0.19017720222473145, "learning_rate": 3.5109983079526226e-06, "loss": 0.0305, "step": 1100 }, { "epoch": 5.634517766497462, "grad_norm": 0.14318296313285828, "learning_rate": 3.0879864636209815e-06, "loss": 0.0304, "step": 1110 }, { "epoch": 5.685279187817259, "grad_norm": 0.13694196939468384, "learning_rate": 2.6649746192893404e-06, "loss": 0.0301, "step": 1120 }, { "epoch": 5.7360406091370555, "grad_norm": 0.11877632886171341, "learning_rate": 2.241962774957699e-06, "loss": 0.0303, "step": 1130 }, { "epoch": 5.786802030456853, "grad_norm": 0.1271430402994156, "learning_rate": 1.8189509306260577e-06, "loss": 0.0321, "step": 1140 }, { "epoch": 5.837563451776649, "grad_norm": 0.1693529337644577, "learning_rate": 1.3959390862944163e-06, "loss": 0.0318, "step": 1150 }, { "epoch": 5.888324873096447, "grad_norm": 0.1400621086359024, "learning_rate": 9.72927241962775e-07, "loss": 0.0273, "step": 1160 }, { "epoch": 5.939086294416244, "grad_norm": 0.20201422274112701, "learning_rate": 5.499153976311337e-07, "loss": 0.0287, "step": 1170 }, { "epoch": 5.98984771573604, "grad_norm": 0.12207765877246857, "learning_rate": 1.2690355329949238e-07, "loss": 0.0313, "step": 1180 }, { "epoch": 6.0, "eval_loss": 0.02633434534072876, "eval_runtime": 6.8922, "eval_samples_per_second": 50.782, "eval_steps_per_second": 3.192, "step": 1182 } ], "logging_steps": 10, "max_steps": 1182, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2581201228677120.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }