{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 2292,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004363953742090334,
      "grad_norm": 26.499397821507202,
      "learning_rate": 3.91304347826087e-07,
      "loss": 1.5146,
      "step": 10
    },
    {
      "epoch": 0.008727907484180668,
      "grad_norm": 18.696221795032976,
      "learning_rate": 8.260869565217392e-07,
      "loss": 1.3975,
      "step": 20
    },
    {
      "epoch": 0.013091861226271002,
      "grad_norm": 5.195746812383845,
      "learning_rate": 1.2608695652173913e-06,
      "loss": 1.0301,
      "step": 30
    },
    {
      "epoch": 0.017455814968361336,
      "grad_norm": 2.9388986813240305,
      "learning_rate": 1.6956521739130435e-06,
      "loss": 0.9338,
      "step": 40
    },
    {
      "epoch": 0.02181976871045167,
      "grad_norm": 2.157896980678217,
      "learning_rate": 2.130434782608696e-06,
      "loss": 0.834,
      "step": 50
    },
    {
      "epoch": 0.026183722452542003,
      "grad_norm": 2.030356833343019,
      "learning_rate": 2.5652173913043484e-06,
      "loss": 0.7817,
      "step": 60
    },
    {
      "epoch": 0.030547676194632337,
      "grad_norm": 2.010110703082707,
      "learning_rate": 3e-06,
      "loss": 0.7423,
      "step": 70
    },
    {
      "epoch": 0.03491162993672267,
      "grad_norm": 2.036582614030934,
      "learning_rate": 3.4347826086956526e-06,
      "loss": 0.7193,
      "step": 80
    },
    {
      "epoch": 0.039275583678813,
      "grad_norm": 1.9487163653762136,
      "learning_rate": 3.869565217391304e-06,
      "loss": 0.6965,
      "step": 90
    },
    {
      "epoch": 0.04363953742090334,
      "grad_norm": 1.8884072181165537,
      "learning_rate": 4.304347826086957e-06,
      "loss": 0.6863,
      "step": 100
    },
    {
      "epoch": 0.04800349116299367,
      "grad_norm": 1.9672176130957406,
      "learning_rate": 4.739130434782609e-06,
      "loss": 0.6761,
      "step": 110
    },
    {
      "epoch": 0.05236744490508401,
      "grad_norm": 1.9175860223528627,
      "learning_rate": 5.173913043478262e-06,
      "loss": 0.669,
      "step": 120
    },
    {
      "epoch": 0.05673139864717434,
      "grad_norm": 4.64170340673796,
      "learning_rate": 5.608695652173914e-06,
      "loss": 0.6553,
      "step": 130
    },
    {
      "epoch": 0.061095352389264675,
      "grad_norm": 2.4419914657221766,
      "learning_rate": 6.043478260869565e-06,
      "loss": 0.6635,
      "step": 140
    },
    {
      "epoch": 0.065459306131355,
      "grad_norm": 2.219811974469962,
      "learning_rate": 6.478260869565218e-06,
      "loss": 0.6417,
      "step": 150
    },
    {
      "epoch": 0.06982325987344534,
      "grad_norm": 1.8731532270945315,
      "learning_rate": 6.91304347826087e-06,
      "loss": 0.6427,
      "step": 160
    },
    {
      "epoch": 0.07418721361553568,
      "grad_norm": 1.9110914927729745,
      "learning_rate": 7.347826086956522e-06,
      "loss": 0.6442,
      "step": 170
    },
    {
      "epoch": 0.078551167357626,
      "grad_norm": 1.9295608959806378,
      "learning_rate": 7.782608695652174e-06,
      "loss": 0.6369,
      "step": 180
    },
    {
      "epoch": 0.08291512109971634,
      "grad_norm": 1.7914358748612738,
      "learning_rate": 8.217391304347827e-06,
      "loss": 0.639,
      "step": 190
    },
    {
      "epoch": 0.08727907484180668,
      "grad_norm": 1.9446891106680222,
      "learning_rate": 8.65217391304348e-06,
      "loss": 0.6414,
      "step": 200
    },
    {
      "epoch": 0.09164302858389702,
      "grad_norm": 2.1347110290826032,
      "learning_rate": 9.086956521739132e-06,
      "loss": 0.6392,
      "step": 210
    },
    {
      "epoch": 0.09600698232598734,
      "grad_norm": 1.9030077411906519,
      "learning_rate": 9.521739130434784e-06,
      "loss": 0.6407,
      "step": 220
    },
    {
      "epoch": 0.10037093606807768,
      "grad_norm": 1.9072699967620688,
      "learning_rate": 9.956521739130436e-06,
      "loss": 0.6296,
      "step": 230
    },
    {
      "epoch": 0.10473488981016801,
      "grad_norm": 6.11411147753339,
      "learning_rate": 9.999529953691501e-06,
      "loss": 0.6364,
      "step": 240
    },
    {
      "epoch": 0.10909884355225835,
      "grad_norm": 1.9274840356722631,
      "learning_rate": 9.997905215717636e-06,
      "loss": 0.6271,
      "step": 250
    },
    {
      "epoch": 0.11346279729434867,
      "grad_norm": 1.6084651209610894,
      "learning_rate": 9.995120360083208e-06,
      "loss": 0.625,
      "step": 260
    },
    {
      "epoch": 0.11782675103643901,
      "grad_norm": 1.9201552574955696,
      "learning_rate": 9.991176033211077e-06,
      "loss": 0.6271,
      "step": 270
    },
    {
      "epoch": 0.12219070477852935,
      "grad_norm": 1.8527351849291473,
      "learning_rate": 9.98607315066148e-06,
      "loss": 0.62,
      "step": 280
    },
    {
      "epoch": 0.12655465852061967,
      "grad_norm": 2.1728262084174137,
      "learning_rate": 9.979812896919497e-06,
      "loss": 0.6234,
      "step": 290
    },
    {
      "epoch": 0.13091861226271,
      "grad_norm": 2.5508242501465848,
      "learning_rate": 9.97239672512011e-06,
      "loss": 0.6228,
      "step": 300
    },
    {
      "epoch": 0.13528256600480035,
      "grad_norm": 1.8024605834099054,
      "learning_rate": 9.963826356710913e-06,
      "loss": 0.6189,
      "step": 310
    },
    {
      "epoch": 0.13964651974689068,
      "grad_norm": 1.7899544274401067,
      "learning_rate": 9.954103781052508e-06,
      "loss": 0.622,
      "step": 320
    },
    {
      "epoch": 0.14401047348898102,
      "grad_norm": 2.6634929838451162,
      "learning_rate": 9.943231254956749e-06,
      "loss": 0.6215,
      "step": 330
    },
    {
      "epoch": 0.14837442723107136,
      "grad_norm": 1.6887392545009612,
      "learning_rate": 9.93121130216288e-06,
      "loss": 0.6095,
      "step": 340
    },
    {
      "epoch": 0.1527383809731617,
      "grad_norm": 1.564527068427684,
      "learning_rate": 9.918046712751731e-06,
      "loss": 0.6073,
      "step": 350
    },
    {
      "epoch": 0.157102334715252,
      "grad_norm": 1.7759578419290645,
      "learning_rate": 9.903740542498071e-06,
      "loss": 0.6095,
      "step": 360
    },
    {
      "epoch": 0.16146628845734234,
      "grad_norm": 2.604040660939313,
      "learning_rate": 9.888296112161308e-06,
      "loss": 0.6039,
      "step": 370
    },
    {
      "epoch": 0.16583024219943268,
      "grad_norm": 1.7552723705971771,
      "learning_rate": 9.87171700671467e-06,
      "loss": 0.6049,
      "step": 380
    },
    {
      "epoch": 0.17019419594152302,
      "grad_norm": 2.786695194524141,
      "learning_rate": 9.854007074513056e-06,
      "loss": 0.6069,
      "step": 390
    },
    {
      "epoch": 0.17455814968361336,
      "grad_norm": 1.6966428135982474,
      "learning_rate": 9.835170426399757e-06,
      "loss": 0.6067,
      "step": 400
    },
    {
      "epoch": 0.1789221034257037,
      "grad_norm": 3.1900936362907073,
      "learning_rate": 9.815211434752236e-06,
      "loss": 0.6021,
      "step": 410
    },
    {
      "epoch": 0.18328605716779403,
      "grad_norm": 2.0950790085022795,
      "learning_rate": 9.79413473246722e-06,
      "loss": 0.6064,
      "step": 420
    },
    {
      "epoch": 0.18765001090988437,
      "grad_norm": 1.8573324559272892,
      "learning_rate": 9.771945211885294e-06,
      "loss": 0.5962,
      "step": 430
    },
    {
      "epoch": 0.19201396465197468,
      "grad_norm": 1.7147053708189852,
      "learning_rate": 9.748648023655302e-06,
      "loss": 0.6005,
      "step": 440
    },
    {
      "epoch": 0.19637791839406502,
      "grad_norm": 1.620349780247674,
      "learning_rate": 9.724248575538764e-06,
      "loss": 0.5922,
      "step": 450
    },
    {
      "epoch": 0.20074187213615535,
      "grad_norm": 1.654417334838078,
      "learning_rate": 9.698752531154623e-06,
      "loss": 0.5977,
      "step": 460
    },
    {
      "epoch": 0.2051058258782457,
      "grad_norm": 3.7007534847341335,
      "learning_rate": 9.672165808664609e-06,
      "loss": 0.5989,
      "step": 470
    },
    {
      "epoch": 0.20946977962033603,
      "grad_norm": 1.8306160811104213,
      "learning_rate": 9.6444945793995e-06,
      "loss": 0.5914,
      "step": 480
    },
    {
      "epoch": 0.21383373336242636,
      "grad_norm": 1.9518306293763308,
      "learning_rate": 9.61574526642664e-06,
      "loss": 0.5883,
      "step": 490
    },
    {
      "epoch": 0.2181976871045167,
      "grad_norm": 1.4825723400886388,
      "learning_rate": 9.585924543058997e-06,
      "loss": 0.591,
      "step": 500
    },
    {
      "epoch": 0.2181976871045167,
      "eval_loss": 0.5827435255050659,
      "eval_runtime": 69.7858,
      "eval_samples_per_second": 53.062,
      "eval_steps_per_second": 3.324,
      "step": 500
    },
    {
      "epoch": 0.22256164084660704,
      "grad_norm": 1.480013652739435,
      "learning_rate": 9.555039331306164e-06,
      "loss": 0.5893,
      "step": 510
    },
    {
      "epoch": 0.22692559458869735,
      "grad_norm": 1.697064020550412,
      "learning_rate": 9.523096800267602e-06,
      "loss": 0.5967,
      "step": 520
    },
    {
      "epoch": 0.2312895483307877,
      "grad_norm": 1.4577169028061336,
      "learning_rate": 9.490104364468557e-06,
      "loss": 0.5876,
      "step": 530
    },
    {
      "epoch": 0.23565350207287802,
      "grad_norm": 1.407863250608549,
      "learning_rate": 9.456069682138981e-06,
      "loss": 0.5879,
      "step": 540
    },
    {
      "epoch": 0.24001745581496836,
      "grad_norm": 1.5047099371255703,
      "learning_rate": 9.421000653435908e-06,
      "loss": 0.5898,
      "step": 550
    },
    {
      "epoch": 0.2443814095570587,
      "grad_norm": 1.5343210297318903,
      "learning_rate": 9.38490541860966e-06,
      "loss": 0.5938,
      "step": 560
    },
    {
      "epoch": 0.24874536329914904,
      "grad_norm": 1.4956667878525625,
      "learning_rate": 9.347792356114324e-06,
      "loss": 0.5893,
      "step": 570
    },
    {
      "epoch": 0.25310931704123935,
      "grad_norm": 1.5101062644216703,
      "learning_rate": 9.30967008066294e-06,
      "loss": 0.5976,
      "step": 580
    },
    {
      "epoch": 0.2574732707833297,
      "grad_norm": 1.5236342080851415,
      "learning_rate": 9.27054744122785e-06,
      "loss": 0.592,
      "step": 590
    },
    {
      "epoch": 0.26183722452542,
      "grad_norm": 1.6140869174258692,
      "learning_rate": 9.230433518986664e-06,
      "loss": 0.5888,
      "step": 600
    },
    {
      "epoch": 0.2662011782675104,
      "grad_norm": 1.419486174653226,
      "learning_rate": 9.189337625214324e-06,
      "loss": 0.5842,
      "step": 610
    },
    {
      "epoch": 0.2705651320096007,
      "grad_norm": 1.5576946265406408,
      "learning_rate": 9.147269299121782e-06,
      "loss": 0.582,
      "step": 620
    },
    {
      "epoch": 0.274929085751691,
      "grad_norm": 1.4457336642256946,
      "learning_rate": 9.104238305641731e-06,
      "loss": 0.579,
      "step": 630
    },
    {
      "epoch": 0.27929303949378137,
      "grad_norm": 1.5422960552682006,
      "learning_rate": 9.060254633161973e-06,
      "loss": 0.5889,
      "step": 640
    },
    {
      "epoch": 0.2836569932358717,
      "grad_norm": 1.5024597086356501,
      "learning_rate": 9.015328491206901e-06,
      "loss": 0.5793,
      "step": 650
    },
    {
      "epoch": 0.28802094697796204,
      "grad_norm": 1.3686070974882203,
      "learning_rate": 8.969470308067662e-06,
      "loss": 0.5807,
      "step": 660
    },
    {
      "epoch": 0.29238490072005235,
      "grad_norm": 2.6998427178862894,
      "learning_rate": 8.922690728381527e-06,
      "loss": 0.5748,
      "step": 670
    },
    {
      "epoch": 0.2967488544621427,
      "grad_norm": 2.065649261696864,
      "learning_rate": 8.875000610661052e-06,
      "loss": 0.5688,
      "step": 680
    },
    {
      "epoch": 0.30111280820423303,
      "grad_norm": 1.5958849520041398,
      "learning_rate": 8.826411024773595e-06,
      "loss": 0.5727,
      "step": 690
    },
    {
      "epoch": 0.3054767619463234,
      "grad_norm": 1.4305834347213016,
      "learning_rate": 8.776933249371769e-06,
      "loss": 0.5621,
      "step": 700
    },
    {
      "epoch": 0.3098407156884137,
      "grad_norm": 1.4826756109906936,
      "learning_rate": 8.726578769275426e-06,
      "loss": 0.57,
      "step": 710
    },
    {
      "epoch": 0.314204669430504,
      "grad_norm": 1.4919661778521895,
      "learning_rate": 8.675359272805796e-06,
      "loss": 0.5733,
      "step": 720
    },
    {
      "epoch": 0.3185686231725944,
      "grad_norm": 1.539486119661036,
      "learning_rate": 8.62328664907239e-06,
      "loss": 0.5617,
      "step": 730
    },
    {
      "epoch": 0.3229325769146847,
      "grad_norm": 2.412337158368947,
      "learning_rate": 8.570372985213283e-06,
      "loss": 0.5748,
      "step": 740
    },
    {
      "epoch": 0.32729653065677505,
      "grad_norm": 1.4949607784946515,
      "learning_rate": 8.516630563589436e-06,
      "loss": 0.5765,
      "step": 750
    },
    {
      "epoch": 0.33166048439886536,
      "grad_norm": 2.1389988555015527,
      "learning_rate": 8.462071858933717e-06,
      "loss": 0.5717,
      "step": 760
    },
    {
      "epoch": 0.33602443814095573,
      "grad_norm": 1.7992243083658257,
      "learning_rate": 8.406709535455242e-06,
      "loss": 0.5728,
      "step": 770
    },
    {
      "epoch": 0.34038839188304604,
      "grad_norm": 1.7459966293740241,
      "learning_rate": 8.35055644389976e-06,
      "loss": 0.5621,
      "step": 780
    },
    {
      "epoch": 0.34475234562513635,
      "grad_norm": 1.6584257041747206,
      "learning_rate": 8.293625618566723e-06,
      "loss": 0.5676,
      "step": 790
    },
    {
      "epoch": 0.3491162993672267,
      "grad_norm": 2.234742145469745,
      "learning_rate": 8.23593027428375e-06,
      "loss": 0.5657,
      "step": 800
    },
    {
      "epoch": 0.353480253109317,
      "grad_norm": 1.4160485895062545,
      "learning_rate": 8.177483803339203e-06,
      "loss": 0.5739,
      "step": 810
    },
    {
      "epoch": 0.3578442068514074,
      "grad_norm": 1.6236310717790317,
      "learning_rate": 8.118299772373546e-06,
      "loss": 0.5708,
      "step": 820
    },
    {
      "epoch": 0.3622081605934977,
      "grad_norm": 1.4409935142518557,
      "learning_rate": 8.05839191923025e-06,
      "loss": 0.5652,
      "step": 830
    },
    {
      "epoch": 0.36657211433558806,
      "grad_norm": 1.7422827738179059,
      "learning_rate": 7.997774149766962e-06,
      "loss": 0.5665,
      "step": 840
    },
    {
      "epoch": 0.37093606807767837,
      "grad_norm": 1.4654585573940748,
      "learning_rate": 7.936460534627648e-06,
      "loss": 0.5592,
      "step": 850
    },
    {
      "epoch": 0.37530002181976874,
      "grad_norm": 1.5684193622827431,
      "learning_rate": 7.874465305976527e-06,
      "loss": 0.558,
      "step": 860
    },
    {
      "epoch": 0.37966397556185905,
      "grad_norm": 1.8405640707393334,
      "learning_rate": 7.81180285419447e-06,
      "loss": 0.5653,
      "step": 870
    },
    {
      "epoch": 0.38402792930394936,
      "grad_norm": 1.475228530845922,
      "learning_rate": 7.748487724538707e-06,
      "loss": 0.5587,
      "step": 880
    },
    {
      "epoch": 0.3883918830460397,
      "grad_norm": 1.331005791046321,
      "learning_rate": 7.684534613766566e-06,
      "loss": 0.574,
      "step": 890
    },
    {
      "epoch": 0.39275583678813003,
      "grad_norm": 1.3582718830868108,
      "learning_rate": 7.619958366724043e-06,
      "loss": 0.5657,
      "step": 900
    },
    {
      "epoch": 0.3971197905302204,
      "grad_norm": 1.3639931742020133,
      "learning_rate": 7.554773972900024e-06,
      "loss": 0.5642,
      "step": 910
    },
    {
      "epoch": 0.4014837442723107,
      "grad_norm": 1.4421529537691287,
      "learning_rate": 7.488996562946886e-06,
      "loss": 0.5585,
      "step": 920
    },
    {
      "epoch": 0.40584769801440107,
      "grad_norm": 1.4453139064733957,
      "learning_rate": 7.422641405168386e-06,
      "loss": 0.5611,
      "step": 930
    },
    {
      "epoch": 0.4102116517564914,
      "grad_norm": 1.41874403862543,
      "learning_rate": 7.355723901975546e-06,
      "loss": 0.5556,
      "step": 940
    },
    {
      "epoch": 0.4145756054985817,
      "grad_norm": 1.569355186279227,
      "learning_rate": 7.288259586311455e-06,
      "loss": 0.5638,
      "step": 950
    },
    {
      "epoch": 0.41893955924067205,
      "grad_norm": 1.340010425244894,
      "learning_rate": 7.22026411804573e-06,
      "loss": 0.554,
      "step": 960
    },
    {
      "epoch": 0.42330351298276236,
      "grad_norm": 1.4102271417835663,
      "learning_rate": 7.151753280339554e-06,
      "loss": 0.5574,
      "step": 970
    },
    {
      "epoch": 0.42766746672485273,
      "grad_norm": 1.4168840313026727,
      "learning_rate": 7.082742975982057e-06,
      "loss": 0.5613,
      "step": 980
    },
    {
      "epoch": 0.43203142046694304,
      "grad_norm": 1.4932003760222727,
      "learning_rate": 7.0132492236989694e-06,
      "loss": 0.5549,
      "step": 990
    },
    {
      "epoch": 0.4363953742090334,
      "grad_norm": 1.3727246781695526,
      "learning_rate": 6.94328815443432e-06,
      "loss": 0.5605,
      "step": 1000
    },
    {
      "epoch": 0.4363953742090334,
      "eval_loss": 0.545971155166626,
      "eval_runtime": 68.1299,
      "eval_samples_per_second": 54.352,
      "eval_steps_per_second": 3.405,
      "step": 1000
    },
    {
      "epoch": 0.4407593279511237,
      "grad_norm": 1.5610600215808323,
      "learning_rate": 6.872876007606127e-06,
      "loss": 0.5505,
      "step": 1010
    },
    {
      "epoch": 0.4451232816932141,
      "grad_norm": 1.2665441451258144,
      "learning_rate": 6.802029127336884e-06,
      "loss": 0.5511,
      "step": 1020
    },
    {
      "epoch": 0.4494872354353044,
      "grad_norm": 1.4767565256072326,
      "learning_rate": 6.73076395865975e-06,
      "loss": 0.5479,
      "step": 1030
    },
    {
      "epoch": 0.4538511891773947,
      "grad_norm": 1.3450898726192824,
      "learning_rate": 6.6590970437013135e-06,
      "loss": 0.5549,
      "step": 1040
    },
    {
      "epoch": 0.45821514291948506,
      "grad_norm": 1.3691285786320522,
      "learning_rate": 6.587045017841828e-06,
      "loss": 0.5566,
      "step": 1050
    },
    {
      "epoch": 0.4625790966615754,
      "grad_norm": 1.407701866024824,
      "learning_rate": 6.514624605853785e-06,
      "loss": 0.5519,
      "step": 1060
    },
    {
      "epoch": 0.46694305040366574,
      "grad_norm": 1.7016011593861633,
      "learning_rate": 6.441852618019757e-06,
      "loss": 0.5453,
      "step": 1070
    },
    {
      "epoch": 0.47130700414575605,
      "grad_norm": 2.0081488788972495,
      "learning_rate": 6.368745946230371e-06,
      "loss": 0.5509,
      "step": 1080
    },
    {
      "epoch": 0.4756709578878464,
      "grad_norm": 1.4313272433739415,
      "learning_rate": 6.295321560063358e-06,
      "loss": 0.5558,
      "step": 1090
    },
    {
      "epoch": 0.4800349116299367,
      "grad_norm": 1.3908306510503996,
      "learning_rate": 6.221596502844558e-06,
      "loss": 0.5496,
      "step": 1100
    },
    {
      "epoch": 0.48439886537202703,
      "grad_norm": 1.4083635646126667,
      "learning_rate": 6.147587887691812e-06,
      "loss": 0.5447,
      "step": 1110
    },
    {
      "epoch": 0.4887628191141174,
      "grad_norm": 17.48008864471473,
      "learning_rate": 6.073312893542644e-06,
      "loss": 0.5507,
      "step": 1120
    },
    {
      "epoch": 0.4931267728562077,
      "grad_norm": 1.7242442516057832,
      "learning_rate": 5.998788761166689e-06,
      "loss": 0.5497,
      "step": 1130
    },
    {
      "epoch": 0.49749072659829807,
      "grad_norm": 1.6226845226734892,
      "learning_rate": 5.9240327891637296e-06,
      "loss": 0.5521,
      "step": 1140
    },
    {
      "epoch": 0.5018546803403884,
      "grad_norm": 1.6604002116905412,
      "learning_rate": 5.849062329948353e-06,
      "loss": 0.5451,
      "step": 1150
    },
    {
      "epoch": 0.5062186340824787,
      "grad_norm": 1.5873724924653319,
      "learning_rate": 5.773894785722082e-06,
      "loss": 0.547,
      "step": 1160
    },
    {
      "epoch": 0.5105825878245691,
      "grad_norm": 1.4609962799328036,
      "learning_rate": 5.698547604433963e-06,
      "loss": 0.551,
      "step": 1170
    },
    {
      "epoch": 0.5149465415666594,
      "grad_norm": 2.35781566677412,
      "learning_rate": 5.623038275730543e-06,
      "loss": 0.5482,
      "step": 1180
    },
    {
      "epoch": 0.5193104953087497,
      "grad_norm": 1.7956175108962174,
      "learning_rate": 5.547384326896152e-06,
      "loss": 0.5378,
      "step": 1190
    },
    {
      "epoch": 0.52367444905084,
      "grad_norm": 1.3754392835784166,
      "learning_rate": 5.4716033187844565e-06,
      "loss": 0.5289,
      "step": 1200
    },
    {
      "epoch": 0.5280384027929304,
      "grad_norm": 1.351125298582442,
      "learning_rate": 5.3957128417422196e-06,
      "loss": 0.5375,
      "step": 1210
    },
    {
      "epoch": 0.5324023565350208,
      "grad_norm": 1.2885359407090353,
      "learning_rate": 5.319730511526225e-06,
      "loss": 0.5484,
      "step": 1220
    },
    {
      "epoch": 0.5367663102771111,
      "grad_norm": 1.4931791483959027,
      "learning_rate": 5.243673965214276e-06,
      "loss": 0.5352,
      "step": 1230
    },
    {
      "epoch": 0.5411302640192014,
      "grad_norm": 1.3301082529926556,
      "learning_rate": 5.167560857111286e-06,
      "loss": 0.5407,
      "step": 1240
    },
    {
      "epoch": 0.5454942177612917,
      "grad_norm": 1.3023885557832229,
      "learning_rate": 5.091408854651327e-06,
      "loss": 0.5354,
      "step": 1250
    },
    {
      "epoch": 0.549858171503382,
      "grad_norm": 1.8840653667484866,
      "learning_rate": 5.0152356342966624e-06,
      "loss": 0.5425,
      "step": 1260
    },
    {
      "epoch": 0.5542221252454724,
      "grad_norm": 1.3585936189959857,
      "learning_rate": 4.939058877434672e-06,
      "loss": 0.5386,
      "step": 1270
    },
    {
      "epoch": 0.5585860789875627,
      "grad_norm": 1.4612137856640806,
      "learning_rate": 4.862896266273627e-06,
      "loss": 0.5512,
      "step": 1280
    },
    {
      "epoch": 0.562950032729653,
      "grad_norm": 1.3030406905808916,
      "learning_rate": 4.786765479738293e-06,
      "loss": 0.5422,
      "step": 1290
    },
    {
      "epoch": 0.5673139864717434,
      "grad_norm": 1.3169625647849659,
      "learning_rate": 4.7106841893662755e-06,
      "loss": 0.5377,
      "step": 1300
    },
    {
      "epoch": 0.5716779402138338,
      "grad_norm": 1.4365133088202295,
      "learning_rate": 4.634670055206092e-06,
      "loss": 0.5429,
      "step": 1310
    },
    {
      "epoch": 0.5760418939559241,
      "grad_norm": 1.400666928678306,
      "learning_rate": 4.5587407217179094e-06,
      "loss": 0.5441,
      "step": 1320
    },
    {
      "epoch": 0.5804058476980144,
      "grad_norm": 1.4020862611201026,
      "learning_rate": 4.482913813677896e-06,
      "loss": 0.5372,
      "step": 1330
    },
    {
      "epoch": 0.5847698014401047,
      "grad_norm": 1.3899074056942011,
      "learning_rate": 4.407206932087143e-06,
      "loss": 0.5431,
      "step": 1340
    },
    {
      "epoch": 0.589133755182195,
      "grad_norm": 1.2780180293099543,
      "learning_rate": 4.331637650086111e-06,
      "loss": 0.5354,
      "step": 1350
    },
    {
      "epoch": 0.5934977089242854,
      "grad_norm": 1.3047906646065353,
      "learning_rate": 4.256223508875536e-06,
      "loss": 0.536,
      "step": 1360
    },
    {
      "epoch": 0.5978616626663757,
      "grad_norm": 2.2450550137169025,
      "learning_rate": 4.180982013644749e-06,
      "loss": 0.5345,
      "step": 1370
    },
    {
      "epoch": 0.6022256164084661,
      "grad_norm": 1.2604878888770283,
      "learning_rate": 4.105930629508369e-06,
      "loss": 0.5349,
      "step": 1380
    },
    {
      "epoch": 0.6065895701505564,
      "grad_norm": 1.4408134764916174,
      "learning_rate": 4.0310867774522724e-06,
      "loss": 0.5338,
      "step": 1390
    },
    {
      "epoch": 0.6109535238926468,
      "grad_norm": 1.194095025493141,
      "learning_rate": 3.95646783028983e-06,
      "loss": 0.5332,
      "step": 1400
    },
    {
      "epoch": 0.6153174776347371,
      "grad_norm": 1.3682237212576729,
      "learning_rate": 3.882091108629311e-06,
      "loss": 0.5305,
      "step": 1410
    },
    {
      "epoch": 0.6196814313768274,
      "grad_norm": 1.3808616625009607,
      "learning_rate": 3.807973876853414e-06,
      "loss": 0.5333,
      "step": 1420
    },
    {
      "epoch": 0.6240453851189177,
      "grad_norm": 1.3166679869381683,
      "learning_rate": 3.734133339111844e-06,
      "loss": 0.5305,
      "step": 1430
    },
    {
      "epoch": 0.628409338861008,
      "grad_norm": 2.075662539665934,
      "learning_rate": 3.660586635327869e-06,
      "loss": 0.5386,
      "step": 1440
    },
    {
      "epoch": 0.6327732926030984,
      "grad_norm": 1.3904648380650928,
      "learning_rate": 3.587350837219788e-06,
      "loss": 0.5396,
      "step": 1450
    },
    {
      "epoch": 0.6371372463451888,
      "grad_norm": 1.492731925374217,
      "learning_rate": 3.5144429443382356e-06,
      "loss": 0.5366,
      "step": 1460
    },
    {
      "epoch": 0.6415012000872791,
      "grad_norm": 1.2451404313444014,
      "learning_rate": 3.4418798801202256e-06,
      "loss": 0.5255,
      "step": 1470
    },
    {
      "epoch": 0.6458651538293694,
      "grad_norm": 1.5668762418599276,
      "learning_rate": 3.3696784879608747e-06,
      "loss": 0.5307,
      "step": 1480
    },
    {
      "epoch": 0.6502291075714598,
      "grad_norm": 1.4372909935764755,
      "learning_rate": 3.2978555273037006e-06,
      "loss": 0.5269,
      "step": 1490
    },
    {
      "epoch": 0.6545930613135501,
      "grad_norm": 1.3776181734494666,
      "learning_rate": 3.2264276697504026e-06,
      "loss": 0.5252,
      "step": 1500
    },
    {
      "epoch": 0.6545930613135501,
      "eval_loss": 0.5199297666549683,
      "eval_runtime": 69.8992,
      "eval_samples_per_second": 52.976,
      "eval_steps_per_second": 3.319,
      "step": 1500
    },
    {
      "epoch": 0.6589570150556404,
      "grad_norm": 1.4365144824102365,
      "learning_rate": 3.1554114951910387e-06,
      "loss": 0.5312,
      "step": 1510
    },
    {
      "epoch": 0.6633209687977307,
      "grad_norm": 1.2422304913068782,
      "learning_rate": 3.0848234879554916e-06,
      "loss": 0.5283,
      "step": 1520
    },
    {
      "epoch": 0.667684922539821,
      "grad_norm": 1.2495828228531431,
      "learning_rate": 3.0146800329871107e-06,
      "loss": 0.5359,
      "step": 1530
    },
    {
      "epoch": 0.6720488762819115,
      "grad_norm": 1.3009383931934748,
      "learning_rate": 2.944997412039422e-06,
      "loss": 0.5286,
      "step": 1540
    },
    {
      "epoch": 0.6764128300240018,
      "grad_norm": 1.3215444464354889,
      "learning_rate": 2.8757917998968042e-06,
      "loss": 0.5345,
      "step": 1550
    },
    {
      "epoch": 0.6807767837660921,
      "grad_norm": 1.199193600939233,
      "learning_rate": 2.807079260619976e-06,
      "loss": 0.5226,
      "step": 1560
    },
    {
      "epoch": 0.6851407375081824,
      "grad_norm": 1.3916367911407932,
      "learning_rate": 2.7388757438171953e-06,
      "loss": 0.5294,
      "step": 1570
    },
    {
      "epoch": 0.6895046912502727,
      "grad_norm": 1.2821114411807195,
      "learning_rate": 2.6711970809420327e-06,
      "loss": 0.5243,
      "step": 1580
    },
    {
      "epoch": 0.6938686449923631,
      "grad_norm": 1.2276903910497599,
      "learning_rate": 2.6040589816185534e-06,
      "loss": 0.5227,
      "step": 1590
    },
    {
      "epoch": 0.6982325987344534,
      "grad_norm": 1.2534508386414063,
      "learning_rate": 2.5374770299947837e-06,
      "loss": 0.5336,
      "step": 1600
    },
    {
      "epoch": 0.7025965524765437,
      "grad_norm": 1.2952567981586396,
      "learning_rate": 2.471466681125316e-06,
      "loss": 0.5242,
      "step": 1610
    },
    {
      "epoch": 0.706960506218634,
      "grad_norm": 1.2491075248862247,
      "learning_rate": 2.4060432573838686e-06,
      "loss": 0.5229,
      "step": 1620
    },
    {
      "epoch": 0.7113244599607245,
      "grad_norm": 1.3070997549733596,
      "learning_rate": 2.3412219449066316e-06,
      "loss": 0.5238,
      "step": 1630
    },
    {
      "epoch": 0.7156884137028148,
      "grad_norm": 1.2500083040545036,
      "learning_rate": 2.2770177900672658e-06,
      "loss": 0.527,
      "step": 1640
    },
    {
      "epoch": 0.7200523674449051,
      "grad_norm": 1.2464501526769323,
      "learning_rate": 2.213445695984318e-06,
      "loss": 0.5208,
      "step": 1650
    },
    {
      "epoch": 0.7244163211869954,
      "grad_norm": 1.2752274285884029,
      "learning_rate": 2.150520419061896e-06,
      "loss": 0.5268,
      "step": 1660
    },
    {
      "epoch": 0.7287802749290857,
      "grad_norm": 1.243680862493251,
      "learning_rate": 2.0882565655644054e-06,
      "loss": 0.5233,
      "step": 1670
    },
    {
      "epoch": 0.7331442286711761,
      "grad_norm": 1.6747464378055472,
      "learning_rate": 2.026668588226133e-06,
      "loss": 0.5254,
      "step": 1680
    },
    {
      "epoch": 0.7375081824132664,
      "grad_norm": 1.2620103056544976,
      "learning_rate": 1.965770782896455e-06,
      "loss": 0.5224,
      "step": 1690
    },
    {
      "epoch": 0.7418721361553567,
      "grad_norm": 1.3278419822937977,
      "learning_rate": 1.9055772852214916e-06,
      "loss": 0.515,
      "step": 1700
    },
    {
      "epoch": 0.746236089897447,
      "grad_norm": 1.6987330727891572,
      "learning_rate": 1.8461020673629172e-06,
      "loss": 0.5144,
      "step": 1710
    },
    {
      "epoch": 0.7506000436395375,
      "grad_norm": 1.3497988296955354,
      "learning_rate": 1.7873589347547321e-06,
      "loss": 0.5173,
      "step": 1720
    },
    {
      "epoch": 0.7549639973816278,
      "grad_norm": 1.6208371219517643,
      "learning_rate": 1.729361522898737e-06,
      "loss": 0.5228,
      "step": 1730
    },
    {
      "epoch": 0.7593279511237181,
      "grad_norm": 1.308791373132814,
      "learning_rate": 1.6721232941994526e-06,
      "loss": 0.5282,
      "step": 1740
    },
    {
      "epoch": 0.7636919048658084,
      "grad_norm": 1.2618118893101176,
      "learning_rate": 1.6156575348392122e-06,
      "loss": 0.5208,
      "step": 1750
    },
    {
      "epoch": 0.7680558586078987,
      "grad_norm": 1.3104736125525216,
      "learning_rate": 1.559977351694158e-06,
      "loss": 0.5208,
      "step": 1760
    },
    {
      "epoch": 0.7724198123499891,
      "grad_norm": 1.2487367116120625,
      "learning_rate": 1.5050956692918739e-06,
      "loss": 0.5186,
      "step": 1770
    },
    {
      "epoch": 0.7767837660920794,
      "grad_norm": 1.156634335046638,
      "learning_rate": 1.4510252268113263e-06,
      "loss": 0.5192,
      "step": 1780
    },
    {
      "epoch": 0.7811477198341698,
      "grad_norm": 1.3021570425828364,
      "learning_rate": 1.3977785751258345e-06,
      "loss": 0.5211,
      "step": 1790
    },
    {
      "epoch": 0.7855116735762601,
      "grad_norm": 1.2504588787454711,
      "learning_rate": 1.34536807388976e-06,
      "loss": 0.5159,
      "step": 1800
    },
    {
      "epoch": 0.7898756273183504,
      "grad_norm": 1.991690391624192,
      "learning_rate": 1.2938058886695643e-06,
      "loss": 0.5169,
      "step": 1810
    },
    {
      "epoch": 0.7942395810604408,
      "grad_norm": 1.4484577791210265,
      "learning_rate": 1.2431039881199374e-06,
      "loss": 0.5246,
      "step": 1820
    },
    {
      "epoch": 0.7986035348025311,
      "grad_norm": 1.2031874504717428,
      "learning_rate": 1.1932741412056187e-06,
      "loss": 0.5238,
      "step": 1830
    },
    {
      "epoch": 0.8029674885446214,
      "grad_norm": 1.4251143516882596,
      "learning_rate": 1.1443279144695746e-06,
      "loss": 0.5202,
      "step": 1840
    },
    {
      "epoch": 0.8073314422867117,
      "grad_norm": 1.428433930967254,
      "learning_rate": 1.0962766693481686e-06,
      "loss": 0.5178,
      "step": 1850
    },
    {
      "epoch": 0.8116953960288021,
      "grad_norm": 1.3957444052098604,
      "learning_rate": 1.049131559533933e-06,
      "loss": 0.5142,
      "step": 1860
    },
    {
      "epoch": 0.8160593497708925,
      "grad_norm": 1.7513113973659105,
      "learning_rate": 1.002903528386564e-06,
      "loss": 0.5188,
      "step": 1870
    },
    {
      "epoch": 0.8204233035129828,
      "grad_norm": 1.3254734615581885,
      "learning_rate": 9.576033063927398e-07,
      "loss": 0.5207,
      "step": 1880
    },
    {
      "epoch": 0.8247872572550731,
      "grad_norm": 1.3416379717629452,
      "learning_rate": 9.132414086753578e-07,
      "loss": 0.5134,
      "step": 1890
    },
    {
      "epoch": 0.8291512109971634,
      "grad_norm": 1.2636084433299684,
      "learning_rate": 8.698281325527502e-07,
      "loss": 0.5144,
      "step": 1900
    },
    {
      "epoch": 0.8335151647392538,
      "grad_norm": 1.718038484652991,
      "learning_rate": 8.273735551484613e-07,
      "loss": 0.5158,
      "step": 1910
    },
    {
      "epoch": 0.8378791184813441,
      "grad_norm": 1.365124227466459,
      "learning_rate": 7.858875310521447e-07,
      "loss": 0.5116,
      "step": 1920
    },
    {
      "epoch": 0.8422430722234344,
      "grad_norm": 1.2857842722903012,
      "learning_rate": 7.45379690032102e-07,
      "loss": 0.5213,
      "step": 1930
    },
    {
      "epoch": 0.8466070259655247,
      "grad_norm": 1.8949074875975938,
      "learning_rate": 7.058594348000142e-07,
      "loss": 0.5126,
      "step": 1940
    },
    {
      "epoch": 0.8509709797076151,
      "grad_norm": 2.0508297274530425,
      "learning_rate": 6.673359388283796e-07,
      "loss": 0.5118,
      "step": 1950
    },
    {
      "epoch": 0.8553349334497055,
      "grad_norm": 1.2982410941631513,
      "learning_rate": 6.29818144221161e-07,
      "loss": 0.5256,
      "step": 1960
    },
    {
      "epoch": 0.8596988871917958,
      "grad_norm": 1.3881080528727965,
      "learning_rate": 5.933147596381295e-07,
      "loss": 0.5178,
      "step": 1970
    },
    {
      "epoch": 0.8640628409338861,
      "grad_norm": 1.2689567204311927,
      "learning_rate": 5.578342582734153e-07,
      "loss": 0.5106,
      "step": 1980
    },
    {
      "epoch": 0.8684267946759764,
      "grad_norm": 1.3514399003622013,
      "learning_rate": 5.233848758886945e-07,
      "loss": 0.515,
      "step": 1990
    },
    {
      "epoch": 0.8727907484180668,
      "grad_norm": 1.8878476820927994,
      "learning_rate": 4.899746089015006e-07,
      "loss": 0.5075,
      "step": 2000
    },
    {
      "epoch": 0.8727907484180668,
      "eval_loss": 0.5055009722709656,
      "eval_runtime": 68.9834,
      "eval_samples_per_second": 53.68,
      "eval_steps_per_second": 3.363,
      "step": 2000
    },
    {
      "epoch": 0.8771547021601571,
      "grad_norm": 1.1864343347355735,
      "learning_rate": 4.57611212529091e-07,
      "loss": 0.5118,
      "step": 2010
    },
    {
      "epoch": 0.8815186559022474,
      "grad_norm": 1.3391728742114728,
      "learning_rate": 4.263021989882965e-07,
      "loss": 0.5209,
      "step": 2020
    },
    {
      "epoch": 0.8858826096443377,
      "grad_norm": 1.265995062918622,
      "learning_rate": 3.9605483575178517e-07,
      "loss": 0.5172,
      "step": 2030
    },
    {
      "epoch": 0.8902465633864282,
      "grad_norm": 1.3612996127522512,
      "learning_rate": 3.6687614386112403e-07,
      "loss": 0.51,
      "step": 2040
    },
    {
      "epoch": 0.8946105171285185,
      "grad_norm": 1.1822823920287409,
      "learning_rate": 3.38772896297056e-07,
      "loss": 0.512,
      "step": 2050
    },
    {
      "epoch": 0.8989744708706088,
      "grad_norm": 1.4239619500028629,
      "learning_rate": 3.117516164073459e-07,
      "loss": 0.519,
      "step": 2060
    },
    {
      "epoch": 0.9033384246126991,
      "grad_norm": 1.3503881714270254,
      "learning_rate": 2.858185763925764e-07,
      "loss": 0.5104,
      "step": 2070
    },
    {
      "epoch": 0.9077023783547894,
      "grad_norm": 1.3564880256851914,
      "learning_rate": 2.6097979585024193e-07,
      "loss": 0.5136,
      "step": 2080
    },
    {
      "epoch": 0.9120663320968798,
      "grad_norm": 1.2482713788944904,
      "learning_rate": 2.3724104037747553e-07,
      "loss": 0.5085,
      "step": 2090
    },
    {
      "epoch": 0.9164302858389701,
      "grad_norm": 1.4746577210230283,
      "learning_rate": 2.146078202327284e-07,
      "loss": 0.5159,
      "step": 2100
    },
    {
      "epoch": 0.9207942395810604,
      "grad_norm": 2.19277205698307,
      "learning_rate": 1.9308538905673135e-07,
      "loss": 0.5118,
      "step": 2110
    },
    {
      "epoch": 0.9251581933231507,
      "grad_norm": 1.3172149163534492,
      "learning_rate": 1.7267874265301187e-07,
      "loss": 0.5139,
      "step": 2120
    },
    {
      "epoch": 0.9295221470652411,
      "grad_norm": 1.217984783068752,
      "learning_rate": 1.533926178282641e-07,
      "loss": 0.5084,
      "step": 2130
    },
    {
      "epoch": 0.9338861008073315,
      "grad_norm": 1.3602984814635521,
      "learning_rate": 1.3523149129283987e-07,
      "loss": 0.5222,
      "step": 2140
    },
    {
      "epoch": 0.9382500545494218,
      "grad_norm": 1.3460153750821495,
      "learning_rate": 1.1819957862160835e-07,
      "loss": 0.5069,
      "step": 2150
    },
    {
      "epoch": 0.9426140082915121,
      "grad_norm": 1.4510294571460467,
      "learning_rate": 1.0230083327543661e-07,
      "loss": 0.5142,
      "step": 2160
    },
    {
      "epoch": 0.9469779620336024,
      "grad_norm": 1.396143496017602,
      "learning_rate": 8.753894568350573e-08,
      "loss": 0.5155,
      "step": 2170
    },
    {
      "epoch": 0.9513419157756928,
      "grad_norm": 1.437628267014867,
      "learning_rate": 7.391734238668846e-08,
      "loss": 0.5074,
      "step": 2180
    },
    {
      "epoch": 0.9557058695177831,
      "grad_norm": 1.3148582495813304,
      "learning_rate": 6.143918524217696e-08,
      "loss": 0.5176,
      "step": 2190
    },
    {
      "epoch": 0.9600698232598734,
      "grad_norm": 1.267741565404716,
      "learning_rate": 5.01073706895483e-08,
      "loss": 0.5137,
      "step": 2200
    },
    {
      "epoch": 0.9644337770019638,
      "grad_norm": 1.2859440475396895,
      "learning_rate": 3.992452907844502e-08,
      "loss": 0.5142,
      "step": 2210
    },
    {
      "epoch": 0.9687977307440541,
      "grad_norm": 1.2296422531635047,
      "learning_rate": 3.08930240580102e-08,
      "loss": 0.5025,
      "step": 2220
    },
    {
      "epoch": 0.9731616844861445,
      "grad_norm": 1.3046489064561737,
      "learning_rate": 2.3014952028240223e-08,
      "loss": 0.5138,
      "step": 2230
    },
    {
      "epoch": 0.9775256382282348,
      "grad_norm": 1.238897807436414,
      "learning_rate": 1.6292141653366232e-08,
      "loss": 0.5067,
      "step": 2240
    },
    {
      "epoch": 0.9818895919703251,
      "grad_norm": 1.2689478765861664,
      "learning_rate": 1.0726153437379816e-08,
      "loss": 0.5188,
      "step": 2250
    },
    {
      "epoch": 0.9862535457124154,
      "grad_norm": 2.492246786468264,
      "learning_rate": 6.3182793618110634e-09,
      "loss": 0.5151,
      "step": 2260
    },
    {
      "epoch": 0.9906174994545058,
      "grad_norm": 1.2722668029711148,
      "learning_rate": 3.0695425858329187e-09,
      "loss": 0.5197,
      "step": 2270
    },
    {
      "epoch": 0.9949814531965961,
      "grad_norm": 1.3363964992431479,
      "learning_rate": 9.806972087605949e-10,
      "loss": 0.5223,
      "step": 2280
    },
    {
      "epoch": 0.9993454069386865,
      "grad_norm": 1.5575122083398747,
      "learning_rate": 5.222809501492787e-11,
      "loss": 0.5178,
      "step": 2290
    },
    {
      "epoch": 1.0,
      "step": 2292,
      "total_flos": 2450977789313024.0,
      "train_loss": 0.5714389552828737,
      "train_runtime": 14627.8115,
      "train_samples_per_second": 25.06,
      "train_steps_per_second": 0.157
    }
  ],
  "logging_steps": 10,
  "max_steps": 2292,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2450977789313024.0,
  "train_batch_size": 10,
  "trial_name": null,
  "trial_params": null
}