{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.6109660574412534, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013054830287206266, "grad_norm": 43.415225982666016, "learning_rate": 4.978241949521324e-05, "loss": 2.9884, "step": 10 }, { "epoch": 0.02610966057441253, "grad_norm": 49.437705993652344, "learning_rate": 4.956483899042646e-05, "loss": 2.9654, "step": 20 }, { "epoch": 0.0391644908616188, "grad_norm": 47.24225997924805, "learning_rate": 4.934725848563969e-05, "loss": 3.5102, "step": 30 }, { "epoch": 0.05221932114882506, "grad_norm": 32.93499755859375, "learning_rate": 4.912967798085292e-05, "loss": 2.4143, "step": 40 }, { "epoch": 0.06527415143603134, "grad_norm": 31.01405143737793, "learning_rate": 4.891209747606615e-05, "loss": 2.6864, "step": 50 }, { "epoch": 0.0783289817232376, "grad_norm": 77.32862854003906, "learning_rate": 4.8694516971279375e-05, "loss": 2.7403, "step": 60 }, { "epoch": 0.09138381201044386, "grad_norm": 28.313295364379883, "learning_rate": 4.84769364664926e-05, "loss": 2.5107, "step": 70 }, { "epoch": 0.10443864229765012, "grad_norm": 38.45579528808594, "learning_rate": 4.825935596170583e-05, "loss": 2.6546, "step": 80 }, { "epoch": 0.1174934725848564, "grad_norm": 41.91643142700195, "learning_rate": 4.8041775456919065e-05, "loss": 2.82, "step": 90 }, { "epoch": 0.13054830287206268, "grad_norm": 36.32301712036133, "learning_rate": 4.782419495213229e-05, "loss": 2.9063, "step": 100 }, { "epoch": 0.14360313315926893, "grad_norm": 30.05735969543457, "learning_rate": 4.760661444734552e-05, "loss": 2.5942, "step": 110 }, { "epoch": 0.1566579634464752, "grad_norm": 38.422706604003906, "learning_rate": 4.738903394255875e-05, "loss": 2.5929, "step": 120 }, { "epoch": 0.16971279373368145, "grad_norm": 49.74126052856445, "learning_rate": 4.7171453437771976e-05, "loss": 2.5271, "step": 130 }, { "epoch": 0.18276762402088773, "grad_norm": 31.077625274658203, "learning_rate": 4.6953872932985203e-05, "loss": 2.6357, "step": 140 }, { "epoch": 0.195822454308094, "grad_norm": 24.155317306518555, "learning_rate": 4.673629242819844e-05, "loss": 2.5678, "step": 150 }, { "epoch": 0.20887728459530025, "grad_norm": 30.788408279418945, "learning_rate": 4.651871192341166e-05, "loss": 2.8182, "step": 160 }, { "epoch": 0.22193211488250653, "grad_norm": 25.434738159179688, "learning_rate": 4.630113141862489e-05, "loss": 3.303, "step": 170 }, { "epoch": 0.2349869451697128, "grad_norm": 37.103668212890625, "learning_rate": 4.608355091383813e-05, "loss": 2.5012, "step": 180 }, { "epoch": 0.24804177545691905, "grad_norm": 70.19502258300781, "learning_rate": 4.586597040905135e-05, "loss": 2.812, "step": 190 }, { "epoch": 0.26109660574412535, "grad_norm": 37.93436050415039, "learning_rate": 4.564838990426458e-05, "loss": 2.6683, "step": 200 }, { "epoch": 0.2741514360313316, "grad_norm": 149.83016967773438, "learning_rate": 4.543080939947781e-05, "loss": 2.9776, "step": 210 }, { "epoch": 0.28720626631853785, "grad_norm": 31.74551010131836, "learning_rate": 4.521322889469104e-05, "loss": 2.9663, "step": 220 }, { "epoch": 0.3002610966057441, "grad_norm": 34.0869255065918, "learning_rate": 4.4995648389904266e-05, "loss": 2.712, "step": 230 }, { "epoch": 0.3133159268929504, "grad_norm": 28.85022735595703, "learning_rate": 4.47780678851175e-05, "loss": 2.3012, "step": 240 }, { "epoch": 0.3263707571801567, "grad_norm": 36.18962860107422, "learning_rate": 4.456048738033072e-05, "loss": 2.4339, "step": 250 }, { "epoch": 0.3394255874673629, "grad_norm": 27.252077102661133, "learning_rate": 4.4342906875543956e-05, "loss": 2.851, "step": 260 }, { "epoch": 0.3524804177545692, "grad_norm": 38.39606857299805, "learning_rate": 4.4125326370757184e-05, "loss": 2.6867, "step": 270 }, { "epoch": 0.36553524804177545, "grad_norm": 25.8907527923584, "learning_rate": 4.390774586597041e-05, "loss": 2.4467, "step": 280 }, { "epoch": 0.3785900783289817, "grad_norm": 24.98986053466797, "learning_rate": 4.369016536118364e-05, "loss": 3.0278, "step": 290 }, { "epoch": 0.391644908616188, "grad_norm": 24.536916732788086, "learning_rate": 4.347258485639687e-05, "loss": 2.3857, "step": 300 }, { "epoch": 0.4046997389033943, "grad_norm": 22.012798309326172, "learning_rate": 4.3255004351610094e-05, "loss": 2.1414, "step": 310 }, { "epoch": 0.4177545691906005, "grad_norm": 25.466167449951172, "learning_rate": 4.303742384682333e-05, "loss": 2.5714, "step": 320 }, { "epoch": 0.4308093994778068, "grad_norm": 31.785062789916992, "learning_rate": 4.281984334203655e-05, "loss": 2.4608, "step": 330 }, { "epoch": 0.44386422976501305, "grad_norm": 36.67721176147461, "learning_rate": 4.2602262837249784e-05, "loss": 2.787, "step": 340 }, { "epoch": 0.45691906005221933, "grad_norm": 49.02054214477539, "learning_rate": 4.238468233246302e-05, "loss": 2.7207, "step": 350 }, { "epoch": 0.4699738903394256, "grad_norm": 26.876636505126953, "learning_rate": 4.216710182767624e-05, "loss": 2.6041, "step": 360 }, { "epoch": 0.4830287206266319, "grad_norm": 34.956336975097656, "learning_rate": 4.1949521322889474e-05, "loss": 3.1905, "step": 370 }, { "epoch": 0.4960835509138381, "grad_norm": 35.72273254394531, "learning_rate": 4.17319408181027e-05, "loss": 2.121, "step": 380 }, { "epoch": 0.5091383812010444, "grad_norm": 28.895980834960938, "learning_rate": 4.151436031331593e-05, "loss": 2.6956, "step": 390 }, { "epoch": 0.5221932114882507, "grad_norm": 28.925390243530273, "learning_rate": 4.129677980852916e-05, "loss": 3.0316, "step": 400 }, { "epoch": 0.5352480417754569, "grad_norm": 34.79185485839844, "learning_rate": 4.107919930374239e-05, "loss": 3.0389, "step": 410 }, { "epoch": 0.5483028720626631, "grad_norm": 30.246923446655273, "learning_rate": 4.086161879895561e-05, "loss": 2.8086, "step": 420 }, { "epoch": 0.5613577023498695, "grad_norm": 32.78372573852539, "learning_rate": 4.064403829416885e-05, "loss": 2.814, "step": 430 }, { "epoch": 0.5744125326370757, "grad_norm": 24.346147537231445, "learning_rate": 4.0426457789382075e-05, "loss": 2.8039, "step": 440 }, { "epoch": 0.587467362924282, "grad_norm": 31.166654586791992, "learning_rate": 4.02088772845953e-05, "loss": 2.9612, "step": 450 }, { "epoch": 0.6005221932114883, "grad_norm": 23.0938777923584, "learning_rate": 3.999129677980853e-05, "loss": 2.5463, "step": 460 }, { "epoch": 0.6135770234986945, "grad_norm": 26.590911865234375, "learning_rate": 3.977371627502176e-05, "loss": 2.3543, "step": 470 }, { "epoch": 0.6266318537859008, "grad_norm": 29.803422927856445, "learning_rate": 3.9556135770234985e-05, "loss": 2.7445, "step": 480 }, { "epoch": 0.639686684073107, "grad_norm": 46.66853713989258, "learning_rate": 3.933855526544822e-05, "loss": 2.4905, "step": 490 }, { "epoch": 0.6527415143603134, "grad_norm": 39.04319381713867, "learning_rate": 3.912097476066145e-05, "loss": 1.8311, "step": 500 }, { "epoch": 0.6657963446475196, "grad_norm": 30.50276756286621, "learning_rate": 3.8903394255874675e-05, "loss": 2.8152, "step": 510 }, { "epoch": 0.6788511749347258, "grad_norm": 37.25984191894531, "learning_rate": 3.868581375108791e-05, "loss": 2.4759, "step": 520 }, { "epoch": 0.6919060052219321, "grad_norm": 25.89512062072754, "learning_rate": 3.846823324630113e-05, "loss": 2.486, "step": 530 }, { "epoch": 0.7049608355091384, "grad_norm": 37.318450927734375, "learning_rate": 3.8250652741514365e-05, "loss": 2.77, "step": 540 }, { "epoch": 0.7180156657963447, "grad_norm": 34.45144271850586, "learning_rate": 3.803307223672759e-05, "loss": 2.2646, "step": 550 }, { "epoch": 0.7310704960835509, "grad_norm": 50.494144439697266, "learning_rate": 3.781549173194082e-05, "loss": 2.7686, "step": 560 }, { "epoch": 0.7441253263707572, "grad_norm": 29.753643035888672, "learning_rate": 3.759791122715405e-05, "loss": 2.6239, "step": 570 }, { "epoch": 0.7571801566579635, "grad_norm": 39.54145431518555, "learning_rate": 3.738033072236728e-05, "loss": 2.4995, "step": 580 }, { "epoch": 0.7702349869451697, "grad_norm": 36.82713317871094, "learning_rate": 3.71627502175805e-05, "loss": 3.0274, "step": 590 }, { "epoch": 0.783289817232376, "grad_norm": 36.62627410888672, "learning_rate": 3.694516971279374e-05, "loss": 2.2364, "step": 600 }, { "epoch": 0.7963446475195822, "grad_norm": 18.279882431030273, "learning_rate": 3.6727589208006965e-05, "loss": 2.3391, "step": 610 }, { "epoch": 0.8093994778067886, "grad_norm": 23.61455535888672, "learning_rate": 3.651000870322019e-05, "loss": 2.5222, "step": 620 }, { "epoch": 0.8224543080939948, "grad_norm": 32.03522872924805, "learning_rate": 3.629242819843342e-05, "loss": 2.2871, "step": 630 }, { "epoch": 0.835509138381201, "grad_norm": 45.24649429321289, "learning_rate": 3.607484769364665e-05, "loss": 2.6863, "step": 640 }, { "epoch": 0.8485639686684073, "grad_norm": 24.39188575744629, "learning_rate": 3.5857267188859876e-05, "loss": 2.6426, "step": 650 }, { "epoch": 0.8616187989556136, "grad_norm": 21.67547607421875, "learning_rate": 3.563968668407311e-05, "loss": 2.1157, "step": 660 }, { "epoch": 0.8746736292428199, "grad_norm": 24.245168685913086, "learning_rate": 3.542210617928634e-05, "loss": 2.3781, "step": 670 }, { "epoch": 0.8877284595300261, "grad_norm": 27.57684326171875, "learning_rate": 3.5204525674499566e-05, "loss": 2.7673, "step": 680 }, { "epoch": 0.9007832898172323, "grad_norm": 28.42872428894043, "learning_rate": 3.49869451697128e-05, "loss": 2.4503, "step": 690 }, { "epoch": 0.9138381201044387, "grad_norm": 39.387813568115234, "learning_rate": 3.476936466492602e-05, "loss": 2.1765, "step": 700 }, { "epoch": 0.9268929503916449, "grad_norm": 20.197811126708984, "learning_rate": 3.4551784160139256e-05, "loss": 2.555, "step": 710 }, { "epoch": 0.9399477806788512, "grad_norm": 22.066137313842773, "learning_rate": 3.4334203655352484e-05, "loss": 2.4827, "step": 720 }, { "epoch": 0.9530026109660574, "grad_norm": 32.67851638793945, "learning_rate": 3.411662315056571e-05, "loss": 2.762, "step": 730 }, { "epoch": 0.9660574412532638, "grad_norm": 28.471988677978516, "learning_rate": 3.389904264577894e-05, "loss": 2.5872, "step": 740 }, { "epoch": 0.97911227154047, "grad_norm": 22.934885025024414, "learning_rate": 3.368146214099217e-05, "loss": 2.8826, "step": 750 }, { "epoch": 0.9921671018276762, "grad_norm": 24.063716888427734, "learning_rate": 3.3463881636205394e-05, "loss": 2.8909, "step": 760 }, { "epoch": 1.0, "eval_loss": 2.3283705711364746, "eval_runtime": 12.1219, "eval_samples_per_second": 112.276, "eval_steps_per_second": 14.107, "step": 766 }, { "epoch": 1.0052219321148825, "grad_norm": 36.15023422241211, "learning_rate": 3.324630113141863e-05, "loss": 2.5282, "step": 770 }, { "epoch": 1.0182767624020888, "grad_norm": 35.99642562866211, "learning_rate": 3.3028720626631856e-05, "loss": 2.176, "step": 780 }, { "epoch": 1.031331592689295, "grad_norm": 30.217031478881836, "learning_rate": 3.2811140121845084e-05, "loss": 2.2727, "step": 790 }, { "epoch": 1.0443864229765012, "grad_norm": 29.16168212890625, "learning_rate": 3.259355961705831e-05, "loss": 2.0302, "step": 800 }, { "epoch": 1.0574412532637076, "grad_norm": 25.400541305541992, "learning_rate": 3.237597911227154e-05, "loss": 2.3489, "step": 810 }, { "epoch": 1.0704960835509139, "grad_norm": 21.281591415405273, "learning_rate": 3.215839860748477e-05, "loss": 2.3976, "step": 820 }, { "epoch": 1.08355091383812, "grad_norm": 23.941238403320312, "learning_rate": 3.1940818102698e-05, "loss": 2.6169, "step": 830 }, { "epoch": 1.0966057441253263, "grad_norm": 26.626665115356445, "learning_rate": 3.172323759791123e-05, "loss": 2.6102, "step": 840 }, { "epoch": 1.1096605744125327, "grad_norm": 28.539621353149414, "learning_rate": 3.150565709312446e-05, "loss": 2.019, "step": 850 }, { "epoch": 1.122715404699739, "grad_norm": 36.77280044555664, "learning_rate": 3.128807658833769e-05, "loss": 2.2338, "step": 860 }, { "epoch": 1.1357702349869452, "grad_norm": 38.7175407409668, "learning_rate": 3.107049608355091e-05, "loss": 2.7259, "step": 870 }, { "epoch": 1.1488250652741514, "grad_norm": 32.9740104675293, "learning_rate": 3.085291557876415e-05, "loss": 2.294, "step": 880 }, { "epoch": 1.1618798955613576, "grad_norm": 35.01115036010742, "learning_rate": 3.0635335073977374e-05, "loss": 2.5392, "step": 890 }, { "epoch": 1.174934725848564, "grad_norm": 40.960968017578125, "learning_rate": 3.0417754569190606e-05, "loss": 1.8062, "step": 900 }, { "epoch": 1.1879895561357703, "grad_norm": 24.627063751220703, "learning_rate": 3.020017406440383e-05, "loss": 2.746, "step": 910 }, { "epoch": 1.2010443864229765, "grad_norm": 28.861692428588867, "learning_rate": 2.998259355961706e-05, "loss": 2.1434, "step": 920 }, { "epoch": 1.2140992167101827, "grad_norm": 30.32466697692871, "learning_rate": 2.976501305483029e-05, "loss": 2.8282, "step": 930 }, { "epoch": 1.227154046997389, "grad_norm": 45.79476547241211, "learning_rate": 2.954743255004352e-05, "loss": 2.1972, "step": 940 }, { "epoch": 1.2402088772845954, "grad_norm": 34.27708435058594, "learning_rate": 2.9329852045256744e-05, "loss": 3.0359, "step": 950 }, { "epoch": 1.2532637075718016, "grad_norm": 33.62773513793945, "learning_rate": 2.9112271540469975e-05, "loss": 2.4254, "step": 960 }, { "epoch": 1.2663185378590078, "grad_norm": 26.693920135498047, "learning_rate": 2.8894691035683203e-05, "loss": 1.883, "step": 970 }, { "epoch": 1.279373368146214, "grad_norm": 36.45111083984375, "learning_rate": 2.8677110530896434e-05, "loss": 2.232, "step": 980 }, { "epoch": 1.2924281984334205, "grad_norm": 34.38032913208008, "learning_rate": 2.845953002610966e-05, "loss": 2.4289, "step": 990 }, { "epoch": 1.3054830287206267, "grad_norm": 19.068925857543945, "learning_rate": 2.8241949521322892e-05, "loss": 2.473, "step": 1000 }, { "epoch": 1.318537859007833, "grad_norm": 30.37474250793457, "learning_rate": 2.8024369016536117e-05, "loss": 2.6123, "step": 1010 }, { "epoch": 1.3315926892950392, "grad_norm": 29.176149368286133, "learning_rate": 2.7806788511749348e-05, "loss": 2.3158, "step": 1020 }, { "epoch": 1.3446475195822454, "grad_norm": 27.721553802490234, "learning_rate": 2.758920800696258e-05, "loss": 2.3646, "step": 1030 }, { "epoch": 1.3577023498694518, "grad_norm": 12.712676048278809, "learning_rate": 2.7371627502175807e-05, "loss": 2.1114, "step": 1040 }, { "epoch": 1.370757180156658, "grad_norm": 23.869230270385742, "learning_rate": 2.7154046997389038e-05, "loss": 2.1335, "step": 1050 }, { "epoch": 1.3838120104438643, "grad_norm": 23.66261100769043, "learning_rate": 2.6936466492602262e-05, "loss": 2.0774, "step": 1060 }, { "epoch": 1.3968668407310705, "grad_norm": 44.03451156616211, "learning_rate": 2.6718885987815496e-05, "loss": 2.6329, "step": 1070 }, { "epoch": 1.4099216710182767, "grad_norm": 20.849573135375977, "learning_rate": 2.650130548302872e-05, "loss": 2.8549, "step": 1080 }, { "epoch": 1.4229765013054831, "grad_norm": 27.392250061035156, "learning_rate": 2.6283724978241952e-05, "loss": 2.4537, "step": 1090 }, { "epoch": 1.4360313315926894, "grad_norm": 25.04733657836914, "learning_rate": 2.606614447345518e-05, "loss": 1.9425, "step": 1100 }, { "epoch": 1.4490861618798956, "grad_norm": 32.247894287109375, "learning_rate": 2.584856396866841e-05, "loss": 2.0056, "step": 1110 }, { "epoch": 1.4621409921671018, "grad_norm": 32.498191833496094, "learning_rate": 2.5630983463881635e-05, "loss": 2.6457, "step": 1120 }, { "epoch": 1.475195822454308, "grad_norm": 26.728214263916016, "learning_rate": 2.5413402959094866e-05, "loss": 1.6954, "step": 1130 }, { "epoch": 1.4882506527415145, "grad_norm": 63.32724380493164, "learning_rate": 2.5195822454308094e-05, "loss": 2.5008, "step": 1140 }, { "epoch": 1.5013054830287205, "grad_norm": 24.65690803527832, "learning_rate": 2.4978241949521325e-05, "loss": 1.5901, "step": 1150 }, { "epoch": 1.514360313315927, "grad_norm": 19.37055206298828, "learning_rate": 2.4760661444734552e-05, "loss": 1.9548, "step": 1160 }, { "epoch": 1.5274151436031331, "grad_norm": 23.001806259155273, "learning_rate": 2.4543080939947783e-05, "loss": 2.382, "step": 1170 }, { "epoch": 1.5404699738903394, "grad_norm": 26.997100830078125, "learning_rate": 2.432550043516101e-05, "loss": 2.2834, "step": 1180 }, { "epoch": 1.5535248041775458, "grad_norm": 22.0489559173584, "learning_rate": 2.410791993037424e-05, "loss": 2.1768, "step": 1190 }, { "epoch": 1.566579634464752, "grad_norm": 29.986967086791992, "learning_rate": 2.389033942558747e-05, "loss": 2.0854, "step": 1200 }, { "epoch": 1.5796344647519582, "grad_norm": 34.0990104675293, "learning_rate": 2.3672758920800698e-05, "loss": 2.4429, "step": 1210 }, { "epoch": 1.5926892950391645, "grad_norm": 25.31661033630371, "learning_rate": 2.3455178416013925e-05, "loss": 2.3145, "step": 1220 }, { "epoch": 1.6057441253263707, "grad_norm": 30.300716400146484, "learning_rate": 2.3237597911227156e-05, "loss": 1.9357, "step": 1230 }, { "epoch": 1.6187989556135771, "grad_norm": 25.158327102661133, "learning_rate": 2.3020017406440384e-05, "loss": 2.3195, "step": 1240 }, { "epoch": 1.6318537859007833, "grad_norm": 33.35712432861328, "learning_rate": 2.280243690165361e-05, "loss": 2.4724, "step": 1250 }, { "epoch": 1.6449086161879896, "grad_norm": 22.938852310180664, "learning_rate": 2.258485639686684e-05, "loss": 2.3876, "step": 1260 }, { "epoch": 1.6579634464751958, "grad_norm": 26.44889259338379, "learning_rate": 2.236727589208007e-05, "loss": 2.1164, "step": 1270 }, { "epoch": 1.671018276762402, "grad_norm": 28.251296997070312, "learning_rate": 2.2149695387293298e-05, "loss": 2.3045, "step": 1280 }, { "epoch": 1.6840731070496084, "grad_norm": 28.00015640258789, "learning_rate": 2.193211488250653e-05, "loss": 1.9546, "step": 1290 }, { "epoch": 1.6971279373368147, "grad_norm": 20.0263729095459, "learning_rate": 2.171453437771976e-05, "loss": 2.2491, "step": 1300 }, { "epoch": 1.7101827676240209, "grad_norm": 23.335580825805664, "learning_rate": 2.1496953872932988e-05, "loss": 2.5543, "step": 1310 }, { "epoch": 1.723237597911227, "grad_norm": 25.97711181640625, "learning_rate": 2.1279373368146216e-05, "loss": 2.2486, "step": 1320 }, { "epoch": 1.7362924281984333, "grad_norm": 24.391855239868164, "learning_rate": 2.1061792863359443e-05, "loss": 2.3483, "step": 1330 }, { "epoch": 1.7493472584856398, "grad_norm": 29.249792098999023, "learning_rate": 2.0844212358572674e-05, "loss": 2.3703, "step": 1340 }, { "epoch": 1.762402088772846, "grad_norm": 20.213987350463867, "learning_rate": 2.0626631853785902e-05, "loss": 2.3178, "step": 1350 }, { "epoch": 1.7754569190600522, "grad_norm": 33.050018310546875, "learning_rate": 2.040905134899913e-05, "loss": 2.2622, "step": 1360 }, { "epoch": 1.7885117493472587, "grad_norm": 25.058115005493164, "learning_rate": 2.019147084421236e-05, "loss": 2.2577, "step": 1370 }, { "epoch": 1.8015665796344646, "grad_norm": 34.79226303100586, "learning_rate": 1.997389033942559e-05, "loss": 2.1586, "step": 1380 }, { "epoch": 1.814621409921671, "grad_norm": 38.56571578979492, "learning_rate": 1.9756309834638816e-05, "loss": 2.0911, "step": 1390 }, { "epoch": 1.8276762402088773, "grad_norm": 27.778825759887695, "learning_rate": 1.9538729329852047e-05, "loss": 2.2379, "step": 1400 }, { "epoch": 1.8407310704960835, "grad_norm": 27.160274505615234, "learning_rate": 1.9321148825065275e-05, "loss": 2.0847, "step": 1410 }, { "epoch": 1.85378590078329, "grad_norm": 26.11197853088379, "learning_rate": 1.9103568320278503e-05, "loss": 2.7021, "step": 1420 }, { "epoch": 1.866840731070496, "grad_norm": 28.448244094848633, "learning_rate": 1.8885987815491734e-05, "loss": 2.1596, "step": 1430 }, { "epoch": 1.8798955613577024, "grad_norm": 49.074729919433594, "learning_rate": 1.866840731070496e-05, "loss": 2.5753, "step": 1440 }, { "epoch": 1.8929503916449086, "grad_norm": 21.96980094909668, "learning_rate": 1.845082680591819e-05, "loss": 2.5118, "step": 1450 }, { "epoch": 1.9060052219321149, "grad_norm": 20.993181228637695, "learning_rate": 1.823324630113142e-05, "loss": 2.2332, "step": 1460 }, { "epoch": 1.9190600522193213, "grad_norm": 20.049209594726562, "learning_rate": 1.801566579634465e-05, "loss": 2.1748, "step": 1470 }, { "epoch": 1.9321148825065273, "grad_norm": 35.51521682739258, "learning_rate": 1.779808529155788e-05, "loss": 2.781, "step": 1480 }, { "epoch": 1.9451697127937337, "grad_norm": 25.36643409729004, "learning_rate": 1.7580504786771106e-05, "loss": 2.7489, "step": 1490 }, { "epoch": 1.95822454308094, "grad_norm": 38.09309387207031, "learning_rate": 1.7362924281984334e-05, "loss": 2.1326, "step": 1500 }, { "epoch": 1.9712793733681462, "grad_norm": 30.636632919311523, "learning_rate": 1.7145343777197565e-05, "loss": 2.2151, "step": 1510 }, { "epoch": 1.9843342036553526, "grad_norm": 27.038352966308594, "learning_rate": 1.6927763272410793e-05, "loss": 2.4523, "step": 1520 }, { "epoch": 1.9973890339425586, "grad_norm": 19.101573944091797, "learning_rate": 1.671018276762402e-05, "loss": 2.518, "step": 1530 }, { "epoch": 2.0, "eval_loss": NaN, "eval_runtime": 11.9494, "eval_samples_per_second": 113.897, "eval_steps_per_second": 14.31, "step": 1532 }, { "epoch": 2.010443864229765, "grad_norm": 28.463035583496094, "learning_rate": 1.649260226283725e-05, "loss": 1.8945, "step": 1540 }, { "epoch": 2.023498694516971, "grad_norm": 30.520097732543945, "learning_rate": 1.627502175805048e-05, "loss": 2.3685, "step": 1550 }, { "epoch": 2.0365535248041775, "grad_norm": 19.876482009887695, "learning_rate": 1.6057441253263707e-05, "loss": 2.1494, "step": 1560 }, { "epoch": 2.049608355091384, "grad_norm": 23.423219680786133, "learning_rate": 1.5839860748476938e-05, "loss": 1.9791, "step": 1570 }, { "epoch": 2.06266318537859, "grad_norm": 20.257450103759766, "learning_rate": 1.5622280243690166e-05, "loss": 2.2526, "step": 1580 }, { "epoch": 2.0757180156657964, "grad_norm": 47.68708038330078, "learning_rate": 1.5404699738903393e-05, "loss": 1.9962, "step": 1590 }, { "epoch": 2.0887728459530024, "grad_norm": 27.561660766601562, "learning_rate": 1.5187119234116623e-05, "loss": 1.6106, "step": 1600 }, { "epoch": 2.101827676240209, "grad_norm": 26.832944869995117, "learning_rate": 1.4969538729329852e-05, "loss": 1.5015, "step": 1610 }, { "epoch": 2.1148825065274153, "grad_norm": 45.05983352661133, "learning_rate": 1.475195822454308e-05, "loss": 1.8897, "step": 1620 }, { "epoch": 2.1279373368146213, "grad_norm": 19.24533462524414, "learning_rate": 1.4534377719756313e-05, "loss": 2.4187, "step": 1630 }, { "epoch": 2.1409921671018277, "grad_norm": 29.476770401000977, "learning_rate": 1.431679721496954e-05, "loss": 2.1733, "step": 1640 }, { "epoch": 2.1540469973890337, "grad_norm": 26.505355834960938, "learning_rate": 1.409921671018277e-05, "loss": 1.8432, "step": 1650 }, { "epoch": 2.16710182767624, "grad_norm": 28.1693058013916, "learning_rate": 1.3881636205395997e-05, "loss": 2.1419, "step": 1660 }, { "epoch": 2.1801566579634466, "grad_norm": 37.704498291015625, "learning_rate": 1.3664055700609227e-05, "loss": 2.7488, "step": 1670 }, { "epoch": 2.1932114882506526, "grad_norm": 22.38772964477539, "learning_rate": 1.3446475195822456e-05, "loss": 2.3531, "step": 1680 }, { "epoch": 2.206266318537859, "grad_norm": 22.58838653564453, "learning_rate": 1.3228894691035684e-05, "loss": 1.8415, "step": 1690 }, { "epoch": 2.2193211488250655, "grad_norm": 30.01149559020996, "learning_rate": 1.3011314186248913e-05, "loss": 2.0757, "step": 1700 }, { "epoch": 2.2323759791122715, "grad_norm": 23.964759826660156, "learning_rate": 1.2793733681462141e-05, "loss": 2.4956, "step": 1710 }, { "epoch": 2.245430809399478, "grad_norm": 33.133541107177734, "learning_rate": 1.257615317667537e-05, "loss": 1.971, "step": 1720 }, { "epoch": 2.258485639686684, "grad_norm": 27.34188461303711, "learning_rate": 1.23585726718886e-05, "loss": 2.2084, "step": 1730 }, { "epoch": 2.2715404699738904, "grad_norm": 25.62513542175293, "learning_rate": 1.2140992167101827e-05, "loss": 1.9445, "step": 1740 }, { "epoch": 2.2845953002610964, "grad_norm": 33.618385314941406, "learning_rate": 1.1923411662315057e-05, "loss": 2.4505, "step": 1750 }, { "epoch": 2.297650130548303, "grad_norm": 25.787757873535156, "learning_rate": 1.1705831157528286e-05, "loss": 2.748, "step": 1760 }, { "epoch": 2.3107049608355092, "grad_norm": 24.533018112182617, "learning_rate": 1.1488250652741515e-05, "loss": 1.496, "step": 1770 }, { "epoch": 2.3237597911227152, "grad_norm": 41.41205596923828, "learning_rate": 1.1270670147954745e-05, "loss": 2.5213, "step": 1780 }, { "epoch": 2.3368146214099217, "grad_norm": 20.163238525390625, "learning_rate": 1.1053089643167972e-05, "loss": 1.8429, "step": 1790 }, { "epoch": 2.349869451697128, "grad_norm": 36.58127975463867, "learning_rate": 1.0835509138381202e-05, "loss": 2.617, "step": 1800 }, { "epoch": 2.362924281984334, "grad_norm": 29.932636260986328, "learning_rate": 1.061792863359443e-05, "loss": 2.2462, "step": 1810 }, { "epoch": 2.3759791122715406, "grad_norm": 20.780025482177734, "learning_rate": 1.0400348128807659e-05, "loss": 1.7878, "step": 1820 }, { "epoch": 2.3890339425587466, "grad_norm": 26.663557052612305, "learning_rate": 1.0182767624020888e-05, "loss": 2.2757, "step": 1830 }, { "epoch": 2.402088772845953, "grad_norm": 34.194435119628906, "learning_rate": 9.965187119234116e-06, "loss": 1.8959, "step": 1840 }, { "epoch": 2.4151436031331595, "grad_norm": 13.468405723571777, "learning_rate": 9.747606614447347e-06, "loss": 2.2176, "step": 1850 }, { "epoch": 2.4281984334203655, "grad_norm": 36.56803512573242, "learning_rate": 9.530026109660575e-06, "loss": 2.2373, "step": 1860 }, { "epoch": 2.441253263707572, "grad_norm": 25.580917358398438, "learning_rate": 9.312445604873804e-06, "loss": 2.004, "step": 1870 }, { "epoch": 2.454308093994778, "grad_norm": 47.98051071166992, "learning_rate": 9.094865100087033e-06, "loss": 2.09, "step": 1880 }, { "epoch": 2.4673629242819843, "grad_norm": 44.3414421081543, "learning_rate": 8.877284595300261e-06, "loss": 2.4412, "step": 1890 }, { "epoch": 2.480417754569191, "grad_norm": 22.451644897460938, "learning_rate": 8.65970409051349e-06, "loss": 1.5646, "step": 1900 }, { "epoch": 2.493472584856397, "grad_norm": 21.15165138244629, "learning_rate": 8.442123585726718e-06, "loss": 1.5899, "step": 1910 }, { "epoch": 2.506527415143603, "grad_norm": 49.44068145751953, "learning_rate": 8.224543080939948e-06, "loss": 2.0553, "step": 1920 }, { "epoch": 2.5195822454308097, "grad_norm": 30.540063858032227, "learning_rate": 8.006962576153177e-06, "loss": 1.7215, "step": 1930 }, { "epoch": 2.5326370757180157, "grad_norm": 29.058853149414062, "learning_rate": 7.789382071366406e-06, "loss": 1.7025, "step": 1940 }, { "epoch": 2.5456919060052217, "grad_norm": 36.38139343261719, "learning_rate": 7.571801566579635e-06, "loss": 1.9976, "step": 1950 }, { "epoch": 2.558746736292428, "grad_norm": 21.742773056030273, "learning_rate": 7.354221061792864e-06, "loss": 1.8528, "step": 1960 }, { "epoch": 2.5718015665796345, "grad_norm": 22.30496597290039, "learning_rate": 7.136640557006093e-06, "loss": 1.726, "step": 1970 }, { "epoch": 2.584856396866841, "grad_norm": 28.57794761657715, "learning_rate": 6.919060052219321e-06, "loss": 1.9179, "step": 1980 }, { "epoch": 2.597911227154047, "grad_norm": 35.680999755859375, "learning_rate": 6.70147954743255e-06, "loss": 1.9789, "step": 1990 }, { "epoch": 2.6109660574412534, "grad_norm": 41.53910827636719, "learning_rate": 6.483899042645779e-06, "loss": 2.3311, "step": 2000 } ], "logging_steps": 10, "max_steps": 2298, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2103832360255488.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }