| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 6.0, | |
| "eval_steps": 500, | |
| "global_step": 1182, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.050761421319796954, | |
| "grad_norm": 1.9596132040023804, | |
| "learning_rate": 4.961928934010153e-05, | |
| "loss": 3.0434, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.10152284263959391, | |
| "grad_norm": 1.4552912712097168, | |
| "learning_rate": 4.919627749576988e-05, | |
| "loss": 1.6984, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.15228426395939088, | |
| "grad_norm": 1.4020640850067139, | |
| "learning_rate": 4.877326565143824e-05, | |
| "loss": 1.0141, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.20304568527918782, | |
| "grad_norm": 1.3040558099746704, | |
| "learning_rate": 4.83502538071066e-05, | |
| "loss": 0.6325, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.25380710659898476, | |
| "grad_norm": 0.9710696935653687, | |
| "learning_rate": 4.792724196277496e-05, | |
| "loss": 0.3562, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.30456852791878175, | |
| "grad_norm": 0.9529483914375305, | |
| "learning_rate": 4.750423011844332e-05, | |
| "loss": 0.2681, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.3553299492385787, | |
| "grad_norm": 0.9297605156898499, | |
| "learning_rate": 4.7081218274111674e-05, | |
| "loss": 0.1867, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.40609137055837563, | |
| "grad_norm": 0.6723515391349792, | |
| "learning_rate": 4.665820642978004e-05, | |
| "loss": 0.1557, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.45685279187817257, | |
| "grad_norm": 0.5906422734260559, | |
| "learning_rate": 4.6235194585448395e-05, | |
| "loss": 0.1332, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.5076142131979695, | |
| "grad_norm": 0.562096357345581, | |
| "learning_rate": 4.5812182741116755e-05, | |
| "loss": 0.1113, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.5583756345177665, | |
| "grad_norm": 0.6856290102005005, | |
| "learning_rate": 4.538917089678511e-05, | |
| "loss": 0.0982, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.6091370558375635, | |
| "grad_norm": 0.3303697407245636, | |
| "learning_rate": 4.496615905245347e-05, | |
| "loss": 0.0794, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.6598984771573604, | |
| "grad_norm": 0.5941248536109924, | |
| "learning_rate": 4.454314720812183e-05, | |
| "loss": 0.0799, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.7106598984771574, | |
| "grad_norm": 0.46145302057266235, | |
| "learning_rate": 4.412013536379019e-05, | |
| "loss": 0.0728, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.7614213197969543, | |
| "grad_norm": 0.5075628161430359, | |
| "learning_rate": 4.369712351945855e-05, | |
| "loss": 0.0705, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.8121827411167513, | |
| "grad_norm": 0.2965494394302368, | |
| "learning_rate": 4.32741116751269e-05, | |
| "loss": 0.0634, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.8629441624365483, | |
| "grad_norm": 0.3922906219959259, | |
| "learning_rate": 4.285109983079527e-05, | |
| "loss": 0.0599, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.9137055837563451, | |
| "grad_norm": 0.3413899540901184, | |
| "learning_rate": 4.242808798646362e-05, | |
| "loss": 0.0529, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.9644670050761421, | |
| "grad_norm": 0.37600159645080566, | |
| "learning_rate": 4.200507614213198e-05, | |
| "loss": 0.0548, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.03613473102450371, | |
| "eval_runtime": 6.8779, | |
| "eval_samples_per_second": 50.887, | |
| "eval_steps_per_second": 3.199, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 1.015228426395939, | |
| "grad_norm": 0.4854377508163452, | |
| "learning_rate": 4.1582064297800336e-05, | |
| "loss": 0.058, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.0659898477157361, | |
| "grad_norm": 0.39907264709472656, | |
| "learning_rate": 4.1159052453468696e-05, | |
| "loss": 0.0609, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.116751269035533, | |
| "grad_norm": 0.24890871345996857, | |
| "learning_rate": 4.073604060913706e-05, | |
| "loss": 0.0473, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.16751269035533, | |
| "grad_norm": 0.4353676736354828, | |
| "learning_rate": 4.0313028764805416e-05, | |
| "loss": 0.0513, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.218274111675127, | |
| "grad_norm": 0.38258448243141174, | |
| "learning_rate": 3.9890016920473777e-05, | |
| "loss": 0.0503, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.2690355329949239, | |
| "grad_norm": 0.3302125334739685, | |
| "learning_rate": 3.946700507614213e-05, | |
| "loss": 0.0478, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.3197969543147208, | |
| "grad_norm": 0.401644229888916, | |
| "learning_rate": 3.90439932318105e-05, | |
| "loss": 0.0457, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.3705583756345177, | |
| "grad_norm": 0.31225109100341797, | |
| "learning_rate": 3.862098138747885e-05, | |
| "loss": 0.0452, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.4213197969543148, | |
| "grad_norm": 0.30924656987190247, | |
| "learning_rate": 3.819796954314721e-05, | |
| "loss": 0.0428, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.4720812182741116, | |
| "grad_norm": 0.543154239654541, | |
| "learning_rate": 3.7774957698815564e-05, | |
| "loss": 0.048, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.5228426395939088, | |
| "grad_norm": 0.2982091009616852, | |
| "learning_rate": 3.735194585448393e-05, | |
| "loss": 0.0427, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.5736040609137056, | |
| "grad_norm": 0.3622360825538635, | |
| "learning_rate": 3.692893401015229e-05, | |
| "loss": 0.0431, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.6243654822335025, | |
| "grad_norm": 0.2379499226808548, | |
| "learning_rate": 3.6505922165820644e-05, | |
| "loss": 0.0408, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.6751269035532994, | |
| "grad_norm": 0.2724953889846802, | |
| "learning_rate": 3.6082910321489004e-05, | |
| "loss": 0.0419, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.7258883248730963, | |
| "grad_norm": 0.21542227268218994, | |
| "learning_rate": 3.565989847715736e-05, | |
| "loss": 0.0439, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.7766497461928934, | |
| "grad_norm": 0.24891333281993866, | |
| "learning_rate": 3.5236886632825724e-05, | |
| "loss": 0.0393, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.8274111675126905, | |
| "grad_norm": 0.18472662568092346, | |
| "learning_rate": 3.481387478849408e-05, | |
| "loss": 0.0372, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.8781725888324874, | |
| "grad_norm": 0.1834375113248825, | |
| "learning_rate": 3.439086294416244e-05, | |
| "loss": 0.0383, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.9289340101522843, | |
| "grad_norm": 0.26916465163230896, | |
| "learning_rate": 3.396785109983079e-05, | |
| "loss": 0.0419, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.9796954314720812, | |
| "grad_norm": 0.2296602427959442, | |
| "learning_rate": 3.354483925549916e-05, | |
| "loss": 0.0391, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.029596656560897827, | |
| "eval_runtime": 6.9466, | |
| "eval_samples_per_second": 50.384, | |
| "eval_steps_per_second": 3.167, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 2.030456852791878, | |
| "grad_norm": 0.22394953668117523, | |
| "learning_rate": 3.312182741116752e-05, | |
| "loss": 0.0368, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.081218274111675, | |
| "grad_norm": 0.24742868542671204, | |
| "learning_rate": 3.269881556683587e-05, | |
| "loss": 0.0417, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 2.1319796954314723, | |
| "grad_norm": 0.17821934819221497, | |
| "learning_rate": 3.227580372250423e-05, | |
| "loss": 0.039, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 2.182741116751269, | |
| "grad_norm": 0.17562909424304962, | |
| "learning_rate": 3.185279187817259e-05, | |
| "loss": 0.036, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 2.233502538071066, | |
| "grad_norm": 0.24495473504066467, | |
| "learning_rate": 3.142978003384095e-05, | |
| "loss": 0.038, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.284263959390863, | |
| "grad_norm": 0.21984700858592987, | |
| "learning_rate": 3.1006768189509306e-05, | |
| "loss": 0.0364, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.33502538071066, | |
| "grad_norm": 0.263046532869339, | |
| "learning_rate": 3.0583756345177666e-05, | |
| "loss": 0.0393, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.3857868020304567, | |
| "grad_norm": 0.494204044342041, | |
| "learning_rate": 3.016074450084603e-05, | |
| "loss": 0.0342, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 2.436548223350254, | |
| "grad_norm": 0.24457719922065735, | |
| "learning_rate": 2.9737732656514383e-05, | |
| "loss": 0.0371, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 2.487309644670051, | |
| "grad_norm": 0.2866905629634857, | |
| "learning_rate": 2.9314720812182743e-05, | |
| "loss": 0.0375, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 2.5380710659898478, | |
| "grad_norm": 0.1922035664319992, | |
| "learning_rate": 2.88917089678511e-05, | |
| "loss": 0.0339, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.5888324873096447, | |
| "grad_norm": 0.2251596301794052, | |
| "learning_rate": 2.846869712351946e-05, | |
| "loss": 0.0316, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 2.6395939086294415, | |
| "grad_norm": 0.19956769049167633, | |
| "learning_rate": 2.8045685279187816e-05, | |
| "loss": 0.0367, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 2.6903553299492384, | |
| "grad_norm": 0.23161649703979492, | |
| "learning_rate": 2.7622673434856176e-05, | |
| "loss": 0.0335, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 2.7411167512690353, | |
| "grad_norm": 0.2735691964626312, | |
| "learning_rate": 2.7199661590524533e-05, | |
| "loss": 0.0367, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 2.7918781725888326, | |
| "grad_norm": 0.3856474757194519, | |
| "learning_rate": 2.6776649746192893e-05, | |
| "loss": 0.0362, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 2.8426395939086295, | |
| "grad_norm": 0.24519683420658112, | |
| "learning_rate": 2.6353637901861257e-05, | |
| "loss": 0.031, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 2.8934010152284264, | |
| "grad_norm": 0.12949654459953308, | |
| "learning_rate": 2.593062605752961e-05, | |
| "loss": 0.032, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 2.9441624365482233, | |
| "grad_norm": 0.1476690173149109, | |
| "learning_rate": 2.5507614213197974e-05, | |
| "loss": 0.0351, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 2.99492385786802, | |
| "grad_norm": 0.24033169448375702, | |
| "learning_rate": 2.5084602368866327e-05, | |
| "loss": 0.0337, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.02790662832558155, | |
| "eval_runtime": 6.9258, | |
| "eval_samples_per_second": 50.536, | |
| "eval_steps_per_second": 3.177, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 3.045685279187817, | |
| "grad_norm": 0.25604188442230225, | |
| "learning_rate": 2.466159052453469e-05, | |
| "loss": 0.0339, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 3.0964467005076144, | |
| "grad_norm": 0.15198302268981934, | |
| "learning_rate": 2.4238578680203047e-05, | |
| "loss": 0.0316, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 3.1472081218274113, | |
| "grad_norm": 0.18943068385124207, | |
| "learning_rate": 2.3815566835871404e-05, | |
| "loss": 0.0302, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 3.197969543147208, | |
| "grad_norm": 0.23807291686534882, | |
| "learning_rate": 2.3392554991539764e-05, | |
| "loss": 0.0338, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 3.248730964467005, | |
| "grad_norm": 0.2615777552127838, | |
| "learning_rate": 2.296954314720812e-05, | |
| "loss": 0.0291, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 3.299492385786802, | |
| "grad_norm": 0.20456817746162415, | |
| "learning_rate": 2.254653130287648e-05, | |
| "loss": 0.0331, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 3.350253807106599, | |
| "grad_norm": 0.29629555344581604, | |
| "learning_rate": 2.2123519458544838e-05, | |
| "loss": 0.0324, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 3.401015228426396, | |
| "grad_norm": 0.19070571660995483, | |
| "learning_rate": 2.17005076142132e-05, | |
| "loss": 0.0312, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 3.451776649746193, | |
| "grad_norm": 0.17927491664886475, | |
| "learning_rate": 2.1277495769881558e-05, | |
| "loss": 0.0331, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 3.50253807106599, | |
| "grad_norm": 0.16211186349391937, | |
| "learning_rate": 2.085448392554992e-05, | |
| "loss": 0.0324, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 3.553299492385787, | |
| "grad_norm": 0.13928809762001038, | |
| "learning_rate": 2.0431472081218275e-05, | |
| "loss": 0.0315, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 3.6040609137055837, | |
| "grad_norm": 0.2813867926597595, | |
| "learning_rate": 2.0008460236886635e-05, | |
| "loss": 0.03, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 3.6548223350253806, | |
| "grad_norm": 0.2689349353313446, | |
| "learning_rate": 1.9585448392554992e-05, | |
| "loss": 0.0333, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 3.7055837563451774, | |
| "grad_norm": 0.2879869043827057, | |
| "learning_rate": 1.916243654822335e-05, | |
| "loss": 0.035, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 3.7563451776649748, | |
| "grad_norm": 0.1638893336057663, | |
| "learning_rate": 1.873942470389171e-05, | |
| "loss": 0.0331, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 3.8071065989847717, | |
| "grad_norm": 0.08905433863401413, | |
| "learning_rate": 1.831641285956007e-05, | |
| "loss": 0.0291, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 3.8578680203045685, | |
| "grad_norm": 0.2221483290195465, | |
| "learning_rate": 1.789340101522843e-05, | |
| "loss": 0.0334, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 3.9086294416243654, | |
| "grad_norm": 0.16910187900066376, | |
| "learning_rate": 1.7470389170896786e-05, | |
| "loss": 0.0331, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 3.9593908629441623, | |
| "grad_norm": 0.20653125643730164, | |
| "learning_rate": 1.7047377326565146e-05, | |
| "loss": 0.0328, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.02677118219435215, | |
| "eval_runtime": 6.9163, | |
| "eval_samples_per_second": 50.605, | |
| "eval_steps_per_second": 3.181, | |
| "step": 788 | |
| }, | |
| { | |
| "epoch": 4.01015228426396, | |
| "grad_norm": 0.5460578203201294, | |
| "learning_rate": 1.6624365482233503e-05, | |
| "loss": 0.0317, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 4.060913705583756, | |
| "grad_norm": 0.1273794323205948, | |
| "learning_rate": 1.6201353637901863e-05, | |
| "loss": 0.0324, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 4.111675126903553, | |
| "grad_norm": 0.2069994956254959, | |
| "learning_rate": 1.577834179357022e-05, | |
| "loss": 0.0328, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 4.16243654822335, | |
| "grad_norm": 0.13560791313648224, | |
| "learning_rate": 1.535532994923858e-05, | |
| "loss": 0.0289, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 4.213197969543147, | |
| "grad_norm": 0.13835355639457703, | |
| "learning_rate": 1.493231810490694e-05, | |
| "loss": 0.0285, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 4.2639593908629445, | |
| "grad_norm": 0.17146103084087372, | |
| "learning_rate": 1.4509306260575298e-05, | |
| "loss": 0.0328, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 4.314720812182741, | |
| "grad_norm": 0.25955504179000854, | |
| "learning_rate": 1.4086294416243657e-05, | |
| "loss": 0.0295, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 4.365482233502538, | |
| "grad_norm": 0.24718697369098663, | |
| "learning_rate": 1.3663282571912014e-05, | |
| "loss": 0.0307, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 4.416243654822335, | |
| "grad_norm": 0.12164635211229324, | |
| "learning_rate": 1.3240270727580372e-05, | |
| "loss": 0.0287, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 4.467005076142132, | |
| "grad_norm": 0.17382808029651642, | |
| "learning_rate": 1.281725888324873e-05, | |
| "loss": 0.0472, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 4.517766497461929, | |
| "grad_norm": 0.17402203381061554, | |
| "learning_rate": 1.239424703891709e-05, | |
| "loss": 0.0343, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 4.568527918781726, | |
| "grad_norm": 0.17245104908943176, | |
| "learning_rate": 1.1971235194585449e-05, | |
| "loss": 0.0318, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 4.619289340101523, | |
| "grad_norm": 0.1376132220029831, | |
| "learning_rate": 1.1548223350253808e-05, | |
| "loss": 0.0319, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 4.67005076142132, | |
| "grad_norm": 0.17528069019317627, | |
| "learning_rate": 1.1125211505922166e-05, | |
| "loss": 0.0302, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 4.720812182741117, | |
| "grad_norm": 0.2443544864654541, | |
| "learning_rate": 1.0702199661590526e-05, | |
| "loss": 0.0295, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 4.771573604060913, | |
| "grad_norm": 0.21152476966381073, | |
| "learning_rate": 1.0279187817258885e-05, | |
| "loss": 0.0331, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 4.822335025380711, | |
| "grad_norm": 0.13216163218021393, | |
| "learning_rate": 9.856175972927243e-06, | |
| "loss": 0.0283, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 4.873096446700508, | |
| "grad_norm": 0.1937057226896286, | |
| "learning_rate": 9.433164128595601e-06, | |
| "loss": 0.0285, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 4.9238578680203045, | |
| "grad_norm": 0.1196654811501503, | |
| "learning_rate": 9.01015228426396e-06, | |
| "loss": 0.0299, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 4.974619289340102, | |
| "grad_norm": 0.14108304679393768, | |
| "learning_rate": 8.587140439932318e-06, | |
| "loss": 0.0326, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.026293369010090828, | |
| "eval_runtime": 7.2804, | |
| "eval_samples_per_second": 48.074, | |
| "eval_steps_per_second": 3.022, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 5.025380710659898, | |
| "grad_norm": 0.11325781047344208, | |
| "learning_rate": 8.164128595600677e-06, | |
| "loss": 0.0303, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 5.0761421319796955, | |
| "grad_norm": 0.1742030531167984, | |
| "learning_rate": 7.741116751269035e-06, | |
| "loss": 0.029, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 5.126903553299492, | |
| "grad_norm": 0.19924026727676392, | |
| "learning_rate": 7.318104906937395e-06, | |
| "loss": 0.0271, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 5.177664974619289, | |
| "grad_norm": 0.23700544238090515, | |
| "learning_rate": 6.895093062605754e-06, | |
| "loss": 0.0306, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 5.228426395939087, | |
| "grad_norm": 0.12165335565805435, | |
| "learning_rate": 6.472081218274112e-06, | |
| "loss": 0.0318, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 5.279187817258883, | |
| "grad_norm": 0.21364423632621765, | |
| "learning_rate": 6.049069373942471e-06, | |
| "loss": 0.03, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 5.32994923857868, | |
| "grad_norm": 0.19045327603816986, | |
| "learning_rate": 5.626057529610829e-06, | |
| "loss": 0.0325, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 5.380710659898477, | |
| "grad_norm": 0.10052906721830368, | |
| "learning_rate": 5.203045685279188e-06, | |
| "loss": 0.0278, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 5.431472081218274, | |
| "grad_norm": 0.2044578194618225, | |
| "learning_rate": 4.780033840947547e-06, | |
| "loss": 0.0309, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 5.482233502538071, | |
| "grad_norm": 0.19502834975719452, | |
| "learning_rate": 4.357021996615906e-06, | |
| "loss": 0.0318, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 5.532994923857868, | |
| "grad_norm": 0.13834765553474426, | |
| "learning_rate": 3.934010152284264e-06, | |
| "loss": 0.0305, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 5.583756345177665, | |
| "grad_norm": 0.19017720222473145, | |
| "learning_rate": 3.5109983079526226e-06, | |
| "loss": 0.0305, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 5.634517766497462, | |
| "grad_norm": 0.14318296313285828, | |
| "learning_rate": 3.0879864636209815e-06, | |
| "loss": 0.0304, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 5.685279187817259, | |
| "grad_norm": 0.13694196939468384, | |
| "learning_rate": 2.6649746192893404e-06, | |
| "loss": 0.0301, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 5.7360406091370555, | |
| "grad_norm": 0.11877632886171341, | |
| "learning_rate": 2.241962774957699e-06, | |
| "loss": 0.0303, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 5.786802030456853, | |
| "grad_norm": 0.1271430402994156, | |
| "learning_rate": 1.8189509306260577e-06, | |
| "loss": 0.0321, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 5.837563451776649, | |
| "grad_norm": 0.1693529337644577, | |
| "learning_rate": 1.3959390862944163e-06, | |
| "loss": 0.0318, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 5.888324873096447, | |
| "grad_norm": 0.1400621086359024, | |
| "learning_rate": 9.72927241962775e-07, | |
| "loss": 0.0273, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 5.939086294416244, | |
| "grad_norm": 0.20201422274112701, | |
| "learning_rate": 5.499153976311337e-07, | |
| "loss": 0.0287, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 5.98984771573604, | |
| "grad_norm": 0.12207765877246857, | |
| "learning_rate": 1.2690355329949238e-07, | |
| "loss": 0.0313, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 0.02633434534072876, | |
| "eval_runtime": 6.8922, | |
| "eval_samples_per_second": 50.782, | |
| "eval_steps_per_second": 3.192, | |
| "step": 1182 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1182, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 6, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2581201228677120.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |